diff --git a/public/images/docs/evaluation/custom/agents-tab.png b/public/images/docs/evaluation/custom/agents-tab.png new file mode 100644 index 00000000..eb564e6a Binary files /dev/null and b/public/images/docs/evaluation/custom/agents-tab.png differ diff --git a/public/images/docs/evaluation/custom/llm-judge-tab.png b/public/images/docs/evaluation/custom/llm-judge-tab.png new file mode 100644 index 00000000..4b2e8b4f Binary files /dev/null and b/public/images/docs/evaluation/custom/llm-judge-tab.png differ diff --git a/public/images/docs/evaluation/custom/output-type-scoring.png b/public/images/docs/evaluation/custom/output-type-scoring.png new file mode 100644 index 00000000..3e8b675c Binary files /dev/null and b/public/images/docs/evaluation/custom/output-type-scoring.png differ diff --git a/public/images/docs/evaluation/error-localization/toggle-on.png b/public/images/docs/evaluation/error-localization/toggle-on.png new file mode 100644 index 00000000..bd5c83c4 Binary files /dev/null and b/public/images/docs/evaluation/error-localization/toggle-on.png differ diff --git a/public/images/docs/evaluation/evaluate/add-evaluation.png b/public/images/docs/evaluation/evaluate/add-evaluation.png new file mode 100644 index 00000000..4e332328 Binary files /dev/null and b/public/images/docs/evaluation/evaluate/add-evaluation.png differ diff --git a/public/images/docs/evaluation/evaluate/configured-evals-panel.png b/public/images/docs/evaluation/evaluate/configured-evals-panel.png new file mode 100644 index 00000000..159b4634 Binary files /dev/null and b/public/images/docs/evaluation/evaluate/configured-evals-panel.png differ diff --git a/public/images/docs/evaluation/evaluate/dataset-page.png b/public/images/docs/evaluation/evaluate/dataset-page.png new file mode 100644 index 00000000..6e4cc6b7 Binary files /dev/null and b/public/images/docs/evaluation/evaluate/dataset-page.png differ diff --git a/public/images/docs/evaluation/evaluate/dataset-with-results-running.png b/public/images/docs/evaluation/evaluate/dataset-with-results-running.png new file mode 100644 index 00000000..e55b8f6e Binary files /dev/null and b/public/images/docs/evaluation/evaluate/dataset-with-results-running.png differ diff --git a/public/images/docs/evaluation/ground-truth/map-variables.png b/public/images/docs/evaluation/ground-truth/map-variables.png new file mode 100644 index 00000000..419bb361 Binary files /dev/null and b/public/images/docs/evaluation/ground-truth/map-variables.png differ diff --git a/public/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png b/public/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png new file mode 100644 index 00000000..0ba2e4fa Binary files /dev/null and b/public/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png differ diff --git a/public/images/docs/evaluation/ground-truth/tab-selected.png b/public/images/docs/evaluation/ground-truth/tab-selected.png new file mode 100644 index 00000000..e1d446eb Binary files /dev/null and b/public/images/docs/evaluation/ground-truth/tab-selected.png differ diff --git a/public/images/docs/evaluation/test-playground/custom-tab.png b/public/images/docs/evaluation/test-playground/custom-tab.png new file mode 100644 index 00000000..98dc2630 Binary files /dev/null and b/public/images/docs/evaluation/test-playground/custom-tab.png differ diff --git a/public/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png 
b/public/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png new file mode 100644 index 00000000..ab547590 Binary files /dev/null and b/public/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png differ diff --git a/public/images/docs/observe/evals/editing-eval-in-tasks-page.png b/public/images/docs/observe/evals/editing-eval-in-tasks-page.png new file mode 100644 index 00000000..0c70fc50 Binary files /dev/null and b/public/images/docs/observe/evals/editing-eval-in-tasks-page.png differ diff --git a/public/images/docs/observe/evals/evals-results-observe-page.png b/public/images/docs/observe/evals/evals-results-observe-page.png new file mode 100644 index 00000000..41610065 Binary files /dev/null and b/public/images/docs/observe/evals/evals-results-observe-page.png differ diff --git a/public/images/docs/observe/evals/task-config-page.png b/public/images/docs/observe/evals/task-config-page.png new file mode 100644 index 00000000..b67332ba Binary files /dev/null and b/public/images/docs/observe/evals/task-config-page.png differ diff --git a/public/images/docs/simulation/add-evaluation-button.png b/public/images/docs/simulation/add-evaluation-button.png new file mode 100644 index 00000000..6f27ba4d Binary files /dev/null and b/public/images/docs/simulation/add-evaluation-button.png differ diff --git a/public/images/docs/simulation/eval-results-page.png b/public/images/docs/simulation/eval-results-page.png new file mode 100644 index 00000000..6c9e2b36 Binary files /dev/null and b/public/images/docs/simulation/eval-results-page.png differ diff --git a/public/images/docs/simulation/map-variables-for-evals.png b/public/images/docs/simulation/map-variables-for-evals.png new file mode 100644 index 00000000..66bac47a Binary files /dev/null and b/public/images/docs/simulation/map-variables-for-evals.png differ diff --git a/public/images/docs/simulation/search-evals.png b/public/images/docs/simulation/search-evals.png new file mode 100644 index 00000000..9fa8e46c Binary files /dev/null and b/public/images/docs/simulation/search-evals.png differ diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts index f8c26b39..ee87594b 100644 --- a/src/lib/navigation.ts +++ b/src/lib/navigation.ts @@ -295,6 +295,10 @@ export const tabNavigation: NavTab[] = [ { title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' }, { title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' }, { title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' }, + { title: 'Output Types', href: '/docs/evaluation/concepts/output-types' }, + { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' }, + { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' }, + { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' }, { title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' }, { title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' }, ] @@ -305,6 +309,9 @@ export const tabNavigation: NavTab[] = [ { title: 'Built-in Evals', href: '/docs/evaluation/builtin' }, { title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' }, { title: 'Create Custom Evals', href: '/docs/evaluation/features/custom' }, + { title: 'Test Playground', href: '/docs/evaluation/features/test-playground' }, + { title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' }, + { title: 'Error Localization', href: 
'/docs/evaluation/features/error-localization' }, { title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' }, { title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' }, { title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' }, diff --git a/src/pages/docs/evaluation/builtin/accuracy.mdx b/src/pages/docs/evaluation/builtin/accuracy.mdx new file mode 100644 index 00000000..88c9907d --- /dev/null +++ b/src/pages/docs/evaluation/builtin/accuracy.mdx @@ -0,0 +1,53 @@ +--- +title: "Accuracy: Built-in Evaluation" +description: "Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comp..." +--- + +Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comparison. + + + +```python Python +result = evaluator.evaluate( + eval_templates="accuracy", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "accuracy", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/answer-similarity.mdx b/src/pages/docs/evaluation/builtin/answer-similarity.mdx new file mode 100644 index 00000000..ed52bf85 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/answer-similarity.mdx @@ -0,0 +1,53 @@ +--- +title: "Answer Similarity: Built-in Evaluation" +description: "Evaluates the similarity between the expected and actual responses" +--- + +Evaluates the similarity between the expected and actual responses. + + + +```python Python +result = evaluator.evaluate( + eval_templates="answer_similarity", + inputs={ + "expected_response": "...", + "response": "..." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "answer_similarity", + { + expected_response: "...", + response: "..." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `expected_response` | `string` | The expected response. | +| | `response` | `string` | The response. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/api-call.mdx b/src/pages/docs/evaluation/builtin/api-call.mdx new file mode 100644 index 00000000..6ddd44f9 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/api-call.mdx @@ -0,0 +1,50 @@ +--- +title: "Api Call: Built-in Evaluation" +description: "Makes an API call and evaluates the response" +--- + +Makes an API call and evaluates the response. + + + +```python Python +result = evaluator.evaluate( + eval_templates="api_call", + inputs={ + "response": "..." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "api_call", + { + response: "..." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `response` | `string` | The response. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Code`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/balanced-accuracy.mdx b/src/pages/docs/evaluation/builtin/balanced-accuracy.mdx new file mode 100644 index 00000000..28da4a92 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/balanced-accuracy.mdx @@ -0,0 +1,53 @@ +--- +title: "Balanced Accuracy: Built-in Evaluation" +description: "Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy" +--- + +Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. + + + +```python Python +result = evaluator.evaluate( + eval_templates="balanced_accuracy", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "balanced_accuracy", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/bleu.mdx b/src/pages/docs/evaluation/builtin/bleu.mdx index 385153e9..8bc276f9 100644 --- a/src/pages/docs/evaluation/builtin/bleu.mdx +++ b/src/pages/docs/evaluation/builtin/bleu.mdx @@ -42,8 +42,8 @@ console.log(result); | **Input** | | | | | ------ | --------- | ---- | ----------- | | | **Required Input** | **Type** | **Description** | -| | `reference` | `string` | Model-generated output to be evaluated. | -| | `hypothesis` | `string` or `List[string]` | One or more reference texts. 
| +| | `reference` | `string` | The reference / ground-truth text the output is being compared against. | +| | `hypothesis` | `string` | The model-generated output being evaluated. | | **Output** | | | | ------ | ----- | ----------- | diff --git a/src/pages/docs/evaluation/builtin/character-error-rate.mdx b/src/pages/docs/evaluation/builtin/character-error-rate.mdx new file mode 100644 index 00000000..7457d0a0 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/character-error-rate.mdx @@ -0,0 +1,53 @@ +--- +title: "Character Error Rate: Built-in Evaluation" +description: "Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as sco..." +--- + +Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as score (higher=better). + + + +```python Python +result = evaluator.evaluate( + eval_templates="character_error_rate", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "character_error_rate", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Audio` diff --git a/src/pages/docs/evaluation/builtin/chrf-score.mdx b/src/pages/docs/evaluation/builtin/chrf-score.mdx new file mode 100644 index 00000000..019e2ca3 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/chrf-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Chrf Score: Built-in Evaluation" +description: "Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to ..." +--- + +Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to order 6 with recall-weighted F-score. + + + +```python Python +result = evaluator.evaluate( + eval_templates="chrf_score", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "chrf_score", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. 
| +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/code-bleu.mdx b/src/pages/docs/evaluation/builtin/code-bleu.mdx new file mode 100644 index 00000000..c11bc6f3 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/code-bleu.mdx @@ -0,0 +1,53 @@ +--- +title: "Code Bleu: Built-in Evaluation" +description: "Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific tokens (def, class, return, if, for,..." +--- + +Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific tokens (def, class, return, if, for, etc). Better than standard BLEU for evaluating code generation. + + + +```python Python +result = evaluator.evaluate( + eval_templates="code_bleu", + inputs={ + "reference": "def add(a, b):\n    return a + b", + "hypothesis": "def add(x, y):\n    return x + y" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "code_bleu", + { + reference: "def add(a, b):\n    return a + b", + hypothesis: "def add(x, y):\n    return x + y" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference (ground-truth) code the output is compared against. | +| | `hypothesis` | `string` | The model-generated code being evaluated. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Code`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/code-complexity.mdx b/src/pages/docs/evaluation/builtin/code-complexity.mdx new file mode 100644 index 00000000..c56916a6 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/code-complexity.mdx @@ -0,0 +1,50 @@ +--- +title: "Code Complexity: Built-in Evaluation" +description: "Computes cyclomatic complexity of Python code using AST analysis. Counts decision points (if, for, while, except, boolean ops). Lower complexity = higher sco..." +--- + +Computes cyclomatic complexity of Python code using AST analysis. Counts decision points (if, for, while, except, boolean ops). Lower complexity = higher score. Useful for code quality evaluation. + + + +```python Python +result = evaluator.evaluate( + eval_templates="code_complexity", + inputs={ + "text": "def absolute(x):\n    if x < 0:\n        return -x\n    return x" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "code_complexity", + { + text: "def absolute(x):\n    if x < 0:\n        return -x\n    return x" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The Python source code to analyze. 
| + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Code`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/cohen-kappa.mdx b/src/pages/docs/evaluation/builtin/cohen-kappa.mdx new file mode 100644 index 00000000..389387ad --- /dev/null +++ b/src/pages/docs/evaluation/builtin/cohen-kappa.mdx @@ -0,0 +1,53 @@ +--- +title: "Cohen Kappa: Built-in Evaluation" +description: "Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. Range -1 to 1, normalized to 0-1 for scoring. Usefu..." +--- + +Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. Range -1 to 1, normalized to 0-1 for scoring. Useful for classification evaluation with imbalanced classes. + + + +```python Python +result = evaluator.evaluate( + eval_templates="cohen_kappa", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "cohen_kappa", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/contains-all.mdx b/src/pages/docs/evaluation/builtin/contains-all.mdx new file mode 100644 index 00000000..bab507f8 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/contains-all.mdx @@ -0,0 +1,50 @@ +--- +title: "Contains All: Built-in Evaluation" +description: "Verifies text contains all specified keywords" +--- + +Verifies text contains all specified keywords. + + + +```python Python +result = evaluator.evaluate( + eval_templates="contains_all", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "contains_all", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/contains-any.mdx b/src/pages/docs/evaluation/builtin/contains-any.mdx new file mode 100644 index 00000000..cf286822 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/contains-any.mdx @@ -0,0 +1,50 @@ +--- +title: "Contains Any: Built-in Evaluation" +description: "Checks if the text contains any of the specified keywords" +--- + +Checks if the text contains any of the specified keywords. + + + +```python Python +result = evaluator.evaluate( + eval_templates="contains_any", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "contains_any", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/contains-none.mdx b/src/pages/docs/evaluation/builtin/contains-none.mdx new file mode 100644 index 00000000..efc429a3 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/contains-none.mdx @@ -0,0 +1,50 @@ +--- +title: "Contains None: Built-in Evaluation" +description: "Verifies text contains none of specified terms" +--- + +Verifies text contains none of specified terms. + + + +```python Python +result = evaluator.evaluate( + eval_templates="contains_none", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "contains_none", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/content-moderation.mdx b/src/pages/docs/evaluation/builtin/content-moderation.mdx new file mode 100644 index 00000000..2611d7f5 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/content-moderation.mdx @@ -0,0 +1,54 @@ +--- +title: "Content Moderation: Built-in Evaluation" +description: "Uses content moderation to evaluate content safety" +--- + +Uses content moderation to evaluate content safety. + + + +```python Python +result = evaluator.evaluate( + eval_templates="content_moderation", + inputs={ + "output": "The capital of France is Paris." 
+ }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "content_moderation", + { + output: "The capital of France is Paris." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Red Teaming`, `Safety`, `Harmful Objects` diff --git a/src/pages/docs/evaluation/builtin/content-safety-violation.mdx b/src/pages/docs/evaluation/builtin/content-safety-violation.mdx new file mode 100644 index 00000000..2ee6a4f1 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/content-safety-violation.mdx @@ -0,0 +1,54 @@ +--- +title: "Content Safety Violation: Built-in Evaluation" +description: "A broad check for content that violates safety or usage policies—this includes toxicity, hate speech, explicit content, violence, etc" +--- + +A broad check for content that violates safety or usage policies—this includes toxicity, hate speech, explicit content, violence, etc. + + + +```python Python +result = evaluator.evaluate( + eval_templates="content_safety_violation", + inputs={ + "output": "The capital of France is Paris." + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "content_safety_violation", + { + output: "The capital of France is Paris." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Red Teaming`, `Safety`, `Harmful Objects` diff --git a/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx b/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx new file mode 100644 index 00000000..639fc619 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/custom-code-evaluation.mdx @@ -0,0 +1,49 @@ +--- +title: "Custom Code Evaluation: Built-in Evaluation" +description: "Executes custom Python code for evaluation" +--- + +Executes custom Python code for evaluation. 
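+
+The snippet below is only a sketch of the kind of scoring function this template can execute. The function name, signature, and return shape are assumptions for illustration; the code that actually runs, and the inputs it receives, are whatever you configure for the eval.
+
+```python Python
+# Hypothetical custom scoring function (illustrative only, not a required interface).
+def evaluate_row(output: str, expected: str) -> dict:
+    # Pass if the expected answer appears in the output, ignoring case.
+    passed = expected.strip().lower() in output.strip().lower()
+    return {
+        "result": "Passed" if passed else "Failed",
+        "reason": "Expected text found in output." if passed else "Expected text missing from output.",
+    }
+
+# Local sanity check of the function itself.
+print(evaluate_row("The capital of France is Paris.", "Paris"))
+```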
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="custom_code_evaluation", + inputs={ + + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "custom_code_evaluation", + { + + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Code`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/detect-hallucination.mdx b/src/pages/docs/evaluation/builtin/detect-hallucination.mdx index 490983ed..38e53bef 100644 --- a/src/pages/docs/evaluation/builtin/detect-hallucination.mdx +++ b/src/pages/docs/evaluation/builtin/detect-hallucination.mdx @@ -43,10 +43,9 @@ console.log(result); | **Input** | | | | | ------ | --------- | ---- | ----------- | | | **Required Input** | **Type** | **Description** | -| | `output` | `string` | Output generated by the model | -| | `context` | `string` | The context provided to the model | -| | **Optional Input** | | | -| | `input` | `string` | Input provided to the model | +| | `output` | `string` | The model's response being evaluated | +| | `context` | `string` | The context the response was supposed to draw from | +| | `input` | `string` | The original prompt or question that produced the output | | **Output** | | | | ------ | ----- | ----------- | diff --git a/src/pages/docs/evaluation/builtin/deterministic-evals.mdx b/src/pages/docs/evaluation/builtin/deterministic-evals.mdx new file mode 100644 index 00000000..9688fef2 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/deterministic-evals.mdx @@ -0,0 +1,53 @@ +--- +title: "Deterministic Evals: Built-in Evaluation" +description: "Evaluates if the output is deterministic or not" +--- + +Evaluates if the output is deterministic or not. + + + +```python Python +result = evaluator.evaluate( + eval_templates="deterministic_evals", + inputs={ + + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "deterministic_evals", + { + + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns one of the predefined categorical labels per row, plus a reason. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/distinct-n.mdx b/src/pages/docs/evaluation/builtin/distinct-n.mdx new file mode 100644 index 00000000..9a6505f8 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/distinct-n.mdx @@ -0,0 +1,50 @@ +--- +title: "Distinct N: Built-in Evaluation" +description: "Computes Distinct-N: ratio of unique n-grams to total n-grams. 
Measures vocabulary diversity in generated text. Higher = more diverse" +--- + +Computes Distinct-N: ratio of unique n-grams to total n-grams. Measures vocabulary diversity in generated text. Higher = more diverse. + + + +```python Python +result = evaluator.evaluate( + eval_templates="distinct_n", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "distinct_n", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/ends-with.mdx b/src/pages/docs/evaluation/builtin/ends-with.mdx new file mode 100644 index 00000000..429fa332 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/ends-with.mdx @@ -0,0 +1,50 @@ +--- +title: "Ends With: Built-in Evaluation" +description: "Checks if text ends with specific substring" +--- + +Checks if text ends with specific substring. + + + +```python Python +result = evaluator.evaluate( + eval_templates="ends_with", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "ends_with", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/equals.mdx b/src/pages/docs/evaluation/builtin/equals.mdx new file mode 100644 index 00000000..609d307c --- /dev/null +++ b/src/pages/docs/evaluation/builtin/equals.mdx @@ -0,0 +1,53 @@ +--- +title: "Equals: Built-in Evaluation" +description: "Compares if two texts are exactly equal" +--- + +Compares if two texts are exactly equal. 
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="equals", + inputs={ + "text": "Hello, this is a sample text.", + "expected_text": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "equals", + { + text: "Hello, this is a sample text.", + expected_text: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | +| | `expected_text` | `string` | The expected text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/f-beta-score.mdx b/src/pages/docs/evaluation/builtin/f-beta-score.mdx new file mode 100644 index 00000000..5d372cb9 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/f-beta-score.mdx @@ -0,0 +1,53 @@ +--- +title: "F Beta Score: Built-in Evaluation" +description: "Computes F-beta score with configurable beta for precision/recall weighting. Lower beta favors precision; higher beta favors recall" +--- + +Computes F-beta score with configurable beta for precision/recall weighting. `Beta<1` favors precision, `beta>1` favors recall. + + + +```python Python +result = evaluator.evaluate( + eval_templates="f_beta_score", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "f_beta_score", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/f1-score.mdx b/src/pages/docs/evaluation/builtin/f1-score.mdx new file mode 100644 index 00000000..de79e661 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/f1-score.mdx @@ -0,0 +1,53 @@ +--- +title: "F1 Score: Built-in Evaluation" +description: "Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates the harmonic mean of precision and recall...." +--- + +Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates the harmonic mean of precision and recall. Widely used for QA and extraction tasks. 
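+
+To make the arithmetic concrete, here is a small sketch of token-level F1 that is independent of the evaluator SDK. Whitespace tokenization and lowercasing are assumptions; the built-in template's exact tokenizer may differ.
+
+```python Python
+# Illustrative token-level F1 (bag-of-tokens precision and recall), independent of the SDK.
+from collections import Counter
+
+def token_f1(output: str, expected: str) -> float:
+    out_tokens = Counter(output.lower().split())
+    exp_tokens = Counter(expected.lower().split())
+    overlap = sum((out_tokens & exp_tokens).values())  # shared tokens, counted with multiplicity
+    if overlap == 0:
+        return 0.0
+    precision = overlap / sum(out_tokens.values())
+    recall = overlap / sum(exp_tokens.values())
+    return 2 * precision * recall / (precision + recall)  # harmonic mean
+
+print(token_f1("Paris is the capital of France", "Paris"))  # one shared token -> F1 ≈ 0.29
+```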
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="f1_score", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "f1_score", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/factual-accuracy.mdx b/src/pages/docs/evaluation/builtin/factual-accuracy.mdx new file mode 100644 index 00000000..ecfb7ef1 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/factual-accuracy.mdx @@ -0,0 +1,60 @@ +--- +title: "Factual Accuracy: Built-in Evaluation" +description: "Verifies if the provided output is factually correct or not" +--- + +Verifies if the provided output is factually correct or not. + + + +```python Python +result = evaluator.evaluate( + eval_templates="factual_accuracy", + inputs={ + "input": "What is the capital of France?", + "output": "The capital of France is Paris.", + "context": "Paris is the capital and most populous city of France." + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "factual_accuracy", + { + input: "What is the capital of France?", + output: "The capital of France is Paris.", + context: "Paris is the capital and most populous city of France." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `input` | `string` | The input. | +| | `output` | `string` | The output. | +| | `context` | `string` | The context. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Hallucination`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/fleiss-kappa.mdx b/src/pages/docs/evaluation/builtin/fleiss-kappa.mdx new file mode 100644 index 00000000..ca2c1b72 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/fleiss-kappa.mdx @@ -0,0 +1,50 @@ +--- +title: "Fleiss Kappa: Built-in Evaluation" +description: "Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects, columns=categories, values=rater counts" +--- + +Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects, columns=categories, values=rater counts. 
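+
+For intuition, the sketch below computes the statistic directly from a rater-count matrix (rows are subjects, columns are categories, values are how many raters picked that category). It is independent of the SDK, and how the built-in template expects this matrix to be encoded in `output` is not shown on this page.
+
+```python Python
+# Illustrative Fleiss' kappa computation (not the SDK call).
+def fleiss_kappa(matrix: list[list[int]]) -> float:
+    n_subjects = len(matrix)
+    n_raters = sum(matrix[0])  # raters per subject, assumed constant across rows
+    total = n_subjects * n_raters
+    p_j = [sum(col) / total for col in zip(*matrix)]  # category proportions
+    P_i = [  # per-subject observed agreement
+        (sum(c * c for c in row) - n_raters) / (n_raters * (n_raters - 1))
+        for row in matrix
+    ]
+    P_bar = sum(P_i) / n_subjects  # mean observed agreement
+    P_e = sum(p * p for p in p_j)  # agreement expected by chance
+    return (P_bar - P_e) / (1 - P_e)  # raw kappa in [-1, 1]; the eval rescales this to 0-1
+
+# 4 subjects, 3 raters, 3 categories.
+print(fleiss_kappa([[3, 0, 0], [0, 3, 0], [1, 1, 1], [2, 1, 0]]))  # ≈ 0.27
+```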
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="fleiss_kappa", + inputs={ + "output": "The capital of France is Paris." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "fleiss_kappa", + { + output: "The capital of France is Paris." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/gleu-score.mdx b/src/pages/docs/evaluation/builtin/gleu-score.mdx new file mode 100644 index 00000000..731842e9 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/gleu-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Gleu Score: Built-in Evaluation" +description: "Computes Google BLEU (GLEU) score. A sentence-level BLEU variant that takes the minimum of precision and recall for each n-gram order, making it more balance..." +--- + +Computes Google BLEU (GLEU) score. A sentence-level BLEU variant that takes the minimum of precision and recall for each n-gram order, making it more balanced than standard BLEU. + + + +```python Python +result = evaluator.evaluate( + eval_templates="gleu_score", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "gleu_score", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/groundedness.mdx b/src/pages/docs/evaluation/builtin/groundedness.mdx index 2df4293f..032fb810 100644 --- a/src/pages/docs/evaluation/builtin/groundedness.mdx +++ b/src/pages/docs/evaluation/builtin/groundedness.mdx @@ -46,10 +46,9 @@ console.log(result); | **Input** | | | | | ------ | --------- | ---- | ----------- | | | **Required Input** |**Type** | **Description** | -| | `output` | `string` | The output generated by the model | -| | `context` | `string` | The context provided to the model | -| | **Optional Input** ||| -| | `input` | `string` | The input provided to the model | +| | `output` | `string` | The model's response being evaluated | +| | `context` | `string` | The context the response was supposed to be grounded in | +| | `input` | `string` | The original prompt or question that produced the output | | **Output** | | | | ------ | ----- | ----------- | diff --git a/src/pages/docs/evaluation/builtin/hamming-similarity.mdx b/src/pages/docs/evaluation/builtin/hamming-similarity.mdx new file mode 100644 index 00000000..71175763 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/hamming-similarity.mdx @@ -0,0 +1,53 @@ +--- +title: "Hamming Similarity: Built-in Evaluation" +description: "Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string length. Pads the shorter string for uneq..." +--- + +Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string length. Pads the shorter string for unequal lengths. + + + +```python Python +result = evaluator.evaluate( + eval_templates="hamming_similarity", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "hamming_similarity", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/image-properties.mdx b/src/pages/docs/evaluation/builtin/image-properties.mdx new file mode 100644 index 00000000..42bd57d8 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/image-properties.mdx @@ -0,0 +1,50 @@ +--- +title: "Image Properties: Built-in Evaluation" +description: "Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet specific requirements" +--- + +Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet specific requirements. + + + +```python Python +result = evaluator.evaluate( + eval_templates="image_properties", + inputs={ + "text": "Hello, this is a sample text." 
+ }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "image_properties", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Image`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/index.mdx b/src/pages/docs/evaluation/builtin/index.mdx index 3510cd03..fdaa1045 100644 --- a/src/pages/docs/evaluation/builtin/index.mdx +++ b/src/pages/docs/evaluation/builtin/index.mdx @@ -86,3 +86,78 @@ description: "Complete reference for all built-in evaluation templates available | [**FID Score**](/docs/evaluation/builtin/fid-score) | Computes the Fréchet Inception Distance between two sets of images; lower scores indicate more similar image distributions. | `real_images`, `fake_images` | Image | Statistical Metric | | [**CLIP Score**](/docs/evaluation/builtin/clip-score) | Measures how well images match their text descriptions; higher scores indicate better image-text alignment (range: 0–100). | `images`, `text` | Image | Statistical Metric | | [**Image Instruction Adherence**](/docs/evaluation/builtin/image-instruction-adherence) | Measures how well generated images adhere to a given text instruction across subject, style, and composition. | `instruction`, `images` | Image | LLM as Judge | +| [**Accuracy**](/docs/evaluation/builtin/accuracy) | Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comparison. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Answer Similarity**](/docs/evaluation/builtin/answer-similarity) | Evaluates the similarity between the expected and actual responses. | `expected_response`, `response` | NLP Metrics, Output Validation | Statistical Metric | +| [**Api Call**](/docs/evaluation/builtin/api-call) | Makes an API call and evaluates the response. | `response` | Code, Output Validation | Deterministic / Rule-based | +| [**Balanced Accuracy**](/docs/evaluation/builtin/balanced-accuracy) | Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Character Error Rate**](/docs/evaluation/builtin/character-error-rate) | Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as score (higher=better). | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Chrf Score**](/docs/evaluation/builtin/chrf-score) | Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to order 6 with recall-weighted F-score. 
| `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**Code Bleu**](/docs/evaluation/builtin/code-bleu) | Computes CodeBLEU - a code-aware BLEU variant that combines standard n-gram BLEU with keyword matching for code-specific tokens (def, class, return, if, for, etc). Better than standard BLEU for evaluating code generation. | `reference`, `hypothesis` | Code, NLP Metrics | Statistical Metric | +| [**Code Complexity**](/docs/evaluation/builtin/code-complexity) | Computes cyclomatic complexity of Python code using AST analysis. Counts decision points (if, for, while, except, boolean ops). Lower complexity = higher score. Useful for code quality evaluation. | `text` | Code, NLP Metrics | Statistical Metric | +| [**Cohen Kappa**](/docs/evaluation/builtin/cohen-kappa) | Computes Cohen's Kappa coefficient for inter-rater agreement. Accounts for agreement occurring by chance. Range -1 to 1, normalized to 0-1 for scoring. Useful for classification evaluation with imbalanced classes. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Contains All**](/docs/evaluation/builtin/contains-all) | Verifies text contains all specified keywords. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Contains Any**](/docs/evaluation/builtin/contains-any) | Checks if the text contains any of the specified keywords. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Contains None**](/docs/evaluation/builtin/contains-none) | Verifies text contains none of specified terms. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Content Moderation**](/docs/evaluation/builtin/content-moderation) | Uses content moderation to evaluate content safety. | `output` | Red Teaming, Safety, Harmful Objects | LLM as Judge | +| [**Content Safety Violation**](/docs/evaluation/builtin/content-safety-violation) | A broad check for content that violates safety or usage policies—this includes toxicity, hate speech, explicit content, violence, etc. | `output` | Red Teaming, Safety, Harmful Objects | LLM as Judge | +| [**Custom Code Evaluation**](/docs/evaluation/builtin/custom-code-evaluation) | Executes custom Python code for evaluation. | — | Code, Output Validation | Deterministic / Rule-based | +| [**Deterministic Evals**](/docs/evaluation/builtin/deterministic-evals) | Evaluates if the output is deterministic or not. | — | Output Validation | LLM as Judge | +| [**Distinct N**](/docs/evaluation/builtin/distinct-n) | Computes Distinct-N: ratio of unique n-grams to total n-grams. Measures vocabulary diversity in generated text. Higher = more diverse. | `text` | NLP Metrics, Text | Statistical Metric | +| [**Ends With**](/docs/evaluation/builtin/ends-with) | Checks if text ends with specific substring. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Equals**](/docs/evaluation/builtin/equals) | Compares if two texts are exactly equal. | `text`, `expected_text` | Output Validation | Deterministic / Rule-based | +| [**F Beta Score**](/docs/evaluation/builtin/f-beta-score) | Computes F-beta score with configurable beta for precision/recall weighting. `Beta<1` favors precision, `beta>1` favors recall. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**F1 Score**](/docs/evaluation/builtin/f1-score) | Computes token-level F1 score between output and expected text. Treats both texts as bags of tokens and calculates the harmonic mean of precision and recall. 
Widely used for QA and extraction tasks. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Factual Accuracy**](/docs/evaluation/builtin/factual-accuracy) | Verifies if the provided output is factually correct or not. | `input`, `output`, `context` | Hallucination, NLP Metrics | LLM as Judge | +| [**Fleiss Kappa**](/docs/evaluation/builtin/fleiss-kappa) | Computes Fleiss' Kappa for multi-rater agreement. Extends Cohen's Kappa to N raters. Input: matrix where rows=subjects, columns=categories, values=rater counts. | `output` | NLP Metrics, Output Validation | Statistical Metric | +| [**Gleu Score**](/docs/evaluation/builtin/gleu-score) | Computes Google BLEU (GLEU) score. A sentence-level BLEU variant that takes the minimum of precision and recall for each n-gram order, making it more balanced than standard BLEU. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**Hamming Similarity**](/docs/evaluation/builtin/hamming-similarity) | Computes Hamming similarity between two strings. Counts matching character positions normalized by the longer string length. Pads the shorter string for unequal lengths. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Image Properties**](/docs/evaluation/builtin/image-properties) | Validates image properties including dimensions, format, and file size. Useful for ensuring generated images meet specific requirements. | `text` | Image, Output Validation | Deterministic / Rule-based | +| [**Is Compliant**](/docs/evaluation/builtin/is-compliant) | Ensures that the output adheres to legal, regulatory, or organizational policies (e.g., HIPAA, GDPR, company rules). | `output` | Safety, Output Validation | LLM as Judge | +| [**Is Factually Consistent**](/docs/evaluation/builtin/is-factually-consistent) | Checks if the generated output is factually consistent with the source/context (e.g., input text or documents). | `input`, `output`, `context` | Hallucination, NLP Metrics | LLM as Judge | +| [**Is Html**](/docs/evaluation/builtin/is-html) | Validates if text contains well-formed HTML with proper tag nesting. Checks for the presence of HTML tags and validates that all non-void tags are properly closed. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Is Refusal**](/docs/evaluation/builtin/is-refusal) | Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected. | `text` | Safety, Output Validation | Deterministic / Rule-based | +| [**Is Sql**](/docs/evaluation/builtin/is-sql) | Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, and balanced quotes. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Is Url**](/docs/evaluation/builtin/is-url) | Validates if text is a properly formatted URL with a valid scheme and network location. | `text` | Output Format, Output Validation | Deterministic / Rule-based | +| [**Is Xml**](/docs/evaluation/builtin/is-xml) | Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document. | `text` | Output Format, Code | Deterministic / Rule-based | +| [**Jaccard Similarity**](/docs/evaluation/builtin/jaccard-similarity) | Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level overlap regardless of frequency or order. 
| `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Jaro Winkler Similarity**](/docs/evaluation/builtin/jaro-winkler-similarity) | Computes Jaro-Winkler similarity between two strings. Particularly effective for short strings like names, labels, and identifiers. Adds a prefix bonus to the base Jaro distance. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Json Diff**](/docs/evaluation/builtin/json-diff) | Deep structural comparison between two JSON objects. Recursively compares keys and values at all levels, returning a score based on the fraction of matching nodes. Useful for evaluating structured output generation. | `output`, `expected` | Output Format, Output Validation | Deterministic / Rule-based | +| [**Json Scheme Validation**](/docs/evaluation/builtin/json-scheme-validation) | Validates JSON against specified criteria. | `actual_json`, `expected_json` | Output Format, Output Validation | Deterministic / Rule-based | +| [**Latency Check**](/docs/evaluation/builtin/latency-check) | Validates that response latency is within acceptable bounds. Pass if `latency <= max_latency_ms`. | `text` | Output Validation | Deterministic / Rule-based | +| [**Length Between**](/docs/evaluation/builtin/length-between) | Checks if the text length is between specified min and max values. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Length Greater Than**](/docs/evaluation/builtin/length-greater-than) | Checks if the text length is greater than a specified value. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Length Less Than**](/docs/evaluation/builtin/length-less-than) | Checks if text length is below threshold. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Log Loss**](/docs/evaluation/builtin/log-loss) | Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Match Error Rate**](/docs/evaluation/builtin/match-error-rate) | Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Matthews Correlation**](/docs/evaluation/builtin/matthews-correlation) | Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that produces high scores only when the prediction obtains good results in all four confusion matrix categories. Range -1 to 1, normalized to 0-1. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Mean Average Precision**](/docs/evaluation/builtin/mean-average-precision) | Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries. | `reference`, `hypothesis` | RAG, Retrieval Systems, NLP Metrics | Statistical Metric | +| [**Meteor Score**](/docs/evaluation/builtin/meteor-score) | Computes METEOR score between reference and hypothesis. Uses unigram matching with exact and stem matching, penalizing fragmentation. More correlated with human judgment than BLEU for many tasks. | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**Non Llm Context Precision**](/docs/evaluation/builtin/non-llm-context-precision) | Non-LLM context precision for RAG evaluation. 
Measures what fraction of retrieved contexts match reference contexts using exact string matching. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | +| [**Non Llm Context Recall**](/docs/evaluation/builtin/non-llm-context-recall) | Non-LLM context recall for RAG evaluation. Measures what fraction of reference contexts were successfully retrieved. | `output`, `expected` | RAG, Retrieval Systems | Deterministic / Rule-based | +| [**One Line**](/docs/evaluation/builtin/one-line) | Checks if the text is a single line. | `text` | Output Validation, Output Format | Deterministic / Rule-based | +| [**Pearson Correlation**](/docs/evaluation/builtin/pearson-correlation) | Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized to 0-1). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Precision Score**](/docs/evaluation/builtin/precision-score) | Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions are actually correct. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Prompt Adherence**](/docs/evaluation/builtin/prompt-adherence) | Assesses how closely the output follows the given prompt instructions, checking for completion of all requested tasks and adherence to specified constraints or formats. Evaluates both explicit and implicit requirements in the prompt. | `input`, `output` | NLP Metrics, Output Validation | LLM as Judge | +| [**Psnr**](/docs/evaluation/builtin/psnr) | Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns a normalized score (0-1) where PSNR is mapped from 0-50 dB range. | `output`, `expected` | Image, NLP Metrics | Statistical Metric | +| [**R2 Score**](/docs/evaluation/builtin/r2-score) | Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions. | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Readability Score**](/docs/evaluation/builtin/readability-score) | Computes Flesch-Kincaid readability metrics. Returns a normalized score (0-1) based on Flesch Reading Ease. Higher scores indicate more readable text. Also reports grade level. | `text` | NLP Metrics, Text | Statistical Metric | +| [**Recall Score**](/docs/evaluation/builtin/recall-score) | Recall: Out of all ground-truth relevant chunks, what fraction was retrieved. | `hypothesis`, `reference` | NLP Metrics, Output Validation | Statistical Metric | +| [**Regex**](/docs/evaluation/builtin/regex) | Checks if the text matches a specified regex pattern. | `text` | Output Validation, Output Format | Deterministic / Rule-based | +| [**Regex Pii Detection**](/docs/evaluation/builtin/regex-pii-detection) | Detects Personally Identifiable Information (PII) using regex patterns. Scans for SSN, credit card numbers, phone numbers, email addresses, and IP addresses. Returns pass (no PII) or fail (PII detected). | `text` | Data Leakage, Safety | Deterministic / Rule-based | +| [**Repetition Rate**](/docs/evaluation/builtin/repetition-rate) | Measures repeated n-gram rate in text. Returns 1-rate as score (higher = less repetitive = better). Useful for detecting degenerate/looping LLM outputs. 
| `text` | NLP Metrics, Text | Statistical Metric | +| [**Rmse**](/docs/evaluation/builtin/rmse) | Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Sentence Count**](/docs/evaluation/builtin/sentence-count) | Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints on generated text. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Spearman Correlation**](/docs/evaluation/builtin/spearman-correlation) | Computes Spearman rank correlation coefficient. Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1). | `output`, `expected` | NLP Metrics, Output Validation | Statistical Metric | +| [**Squad Score**](/docs/evaluation/builtin/squad-score) | Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before comparing. | `output`, `expected` | NLP Metrics, RAG | Statistical Metric | +| [**Ssim**](/docs/evaluation/builtin/ssim) | Computes Structural Similarity Index (SSIM) between two images. Measures perceptual similarity based on luminance, contrast, and structure. Score ranges from 0 (completely different) to 1 (identical). | `output`, `expected` | Image, NLP Metrics | Statistical Metric | +| [**Starts With**](/docs/evaluation/builtin/starts-with) | Checks if text begins with specific substring. | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Step Count**](/docs/evaluation/builtin/step-count) | Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, maximum, or range. | `output` | Agents, Output Validation | Deterministic / Rule-based | +| [**Syntax Validation**](/docs/evaluation/builtin/syntax-validation) | Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket validation. Useful for checking if LLM-generated code is syntactically correct. | `text` | Code, Output Validation | Deterministic / Rule-based | +| [**Tool Call Accuracy**](/docs/evaluation/builtin/tool-call-accuracy) | Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and arguments, supporting both OpenAI and generic tool call formats. Scores full matches (name+args) at 1.0 and name-only matches at 0.5. | `output`, `expected` | Agents, Code | Deterministic / Rule-based | +| [**Trajectory Match**](/docs/evaluation/builtin/trajectory-match) | Validates agent action/tool call sequences. Supports strict (same order), unordered (any order), subset (expected in actual), and superset (actual in expected) matching modes. | `output`, `expected` | Agents, Output Validation | Deterministic / Rule-based | +| [**Translation Edit Rate**](/docs/evaluation/builtin/translation-edit-rate) | Computes Translation Edit Rate (TER). TER measures the minimum number of edits (insertions, deletions, substitutions) needed to transform the hypothesis into the reference, normalized by reference length. Returns 1-TER (higher=better). | `reference`, `hypothesis` | NLP Metrics, Text | Statistical Metric | +| [**Type Token Ratio**](/docs/evaluation/builtin/type-token-ratio) | Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. Measures lexical diversity. 
| `text` | NLP Metrics, Text | Statistical Metric | +| [**Word Count In Range**](/docs/evaluation/builtin/word-count-in-range) | Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated responses (e.g., summaries, tweets, abstracts). | `text` | Output Validation, Text | Deterministic / Rule-based | +| [**Word Error Rate**](/docs/evaluation/builtin/word-error-rate) | Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between reference and hypothesis transcriptions. Returns 1-WER as score (higher=better). | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Word Info Lost**](/docs/evaluation/builtin/word-info-lost) | Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | +| [**Word Info Preserved**](/docs/evaluation/builtin/word-info-preserved) | Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better. | `reference`, `hypothesis` | NLP Metrics, Audio | Statistical Metric | diff --git a/src/pages/docs/evaluation/builtin/is-compliant.mdx b/src/pages/docs/evaluation/builtin/is-compliant.mdx new file mode 100644 index 00000000..cfb8144a --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-compliant.mdx @@ -0,0 +1,54 @@ +--- +title: "Is Compliant: Built-in Evaluation" +description: "Ensures that the output adheres to legal, regulatory, or organizational policies (e.g., HIPAA, GDPR, company rules)" +--- + +Ensures that the output adheres to legal, regulatory, or organizational policies (e.g., HIPAA, GDPR, company rules). + + + +```python Python +result = evaluator.evaluate( + eval_templates="is_compliant", + inputs={ + "output": "The capital of France is Paris." + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_compliant", + { + output: "The capital of France is Paris." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Safety`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx b/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx new file mode 100644 index 00000000..01de04e2 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-factually-consistent.mdx @@ -0,0 +1,60 @@ +--- +title: "Is Factually Consistent: Built-in Evaluation" +description: "Checks if the generated output is factually consistent with the source/context (e.g., input text or documents)" +--- + +Checks if the generated output is factually consistent with the source/context (e.g., input text or documents). 
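+
+A contradiction between the output and the supplied context is the typical failure case. Below is a minimal sketch (reusing the same `evaluator` as the examples that follow; the contradicting output is illustrative and would be expected to fail):
+
+```python Python
+# Illustrative only: the output contradicts the context, so the eval should report Failed.
+result = evaluator.evaluate(
+    eval_templates="is_factually_consistent",
+    inputs={
+        "input": "What is the capital of France?",
+        "output": "The capital of France is Lyon.",
+        "context": "Paris is the capital and most populous city of France."
+    },
+    model_name="turing_flash"
+)
+
+print(result.eval_results[0].output)
+print(result.eval_results[0].reason)
+```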
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="is_factually_consistent", + inputs={ + "input": "What is the capital of France?", + "output": "The capital of France is Paris.", + "context": "Paris is the capital and most populous city of France." + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_factually_consistent", + { + input: "What is the capital of France?", + output: "The capital of France is Paris.", + context: "Paris is the capital and most populous city of France." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `input` | `string` | The input. | +| | `output` | `string` | The output. | +| | `context` | `string` | The context. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Hallucination`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/is-html.mdx b/src/pages/docs/evaluation/builtin/is-html.mdx new file mode 100644 index 00000000..b042b969 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-html.mdx @@ -0,0 +1,50 @@ +--- +title: "Is Html: Built-in Evaluation" +description: "Validates if text contains well-formed HTML with proper tag nesting. Checks for the presence of HTML tags and validates that all non-void tags are properly c..." +--- + +Validates if text contains well-formed HTML with proper tag nesting. Checks for the presence of HTML tags and validates that all non-void tags are properly closed. + + + +```python Python +result = evaluator.evaluate( + eval_templates="is_html", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_html", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Code` diff --git a/src/pages/docs/evaluation/builtin/is-refusal.mdx b/src/pages/docs/evaluation/builtin/is-refusal.mdx new file mode 100644 index 00000000..9cc856c1 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-refusal.mdx @@ -0,0 +1,50 @@ +--- +title: "Is Refusal: Built-in Evaluation" +description: "Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected" +--- + +Detects if LLM output is a refusal to answer using common refusal pattern matching. Returns True if refusal detected. 
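+
+Because the check relies on common refusal phrasing, a more representative probe is an actual refusal string. A minimal sketch (same `evaluator` as in the examples below; the refusal text is illustrative):
+
+```python Python
+# Illustrative refusal phrasing; wording like "I'm sorry, but I can't ..." matches common refusal patterns.
+result = evaluator.evaluate(
+    eval_templates="is_refusal",
+    inputs={
+        "text": "I'm sorry, but I can't help with that request."
+    },
+)
+
+print(result.eval_results[0].output)
+print(result.eval_results[0].reason)
+```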
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="is_refusal", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_refusal", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Safety`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/is-sql.mdx b/src/pages/docs/evaluation/builtin/is-sql.mdx new file mode 100644 index 00000000..76f12736 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-sql.mdx @@ -0,0 +1,50 @@ +--- +title: "Is Sql: Built-in Evaluation" +description: "Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, and balanced quotes" +--- + +Validates if text appears to be syntactically valid SQL. Checks for recognized SQL keywords, balanced parentheses, and balanced quotes. + + + +```python Python +result = evaluator.evaluate( + eval_templates="is_sql", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_sql", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Code` diff --git a/src/pages/docs/evaluation/builtin/is-url.mdx b/src/pages/docs/evaluation/builtin/is-url.mdx new file mode 100644 index 00000000..dd02c95c --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-url.mdx @@ -0,0 +1,50 @@ +--- +title: "Is Url: Built-in Evaluation" +description: "Validates if text is a properly formatted URL with a valid scheme and network location" +--- + +Validates if text is a properly formatted URL with a valid scheme and network location. + + + +```python Python +result = evaluator.evaluate( + eval_templates="is_url", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_url", + { + text: "Hello, this is a sample text." 
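+    // Placeholder input: for a realistic check, pass a URL string such as "https://example.com" (illustrative value).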
+ } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/is-xml.mdx b/src/pages/docs/evaluation/builtin/is-xml.mdx new file mode 100644 index 00000000..36ca4eef --- /dev/null +++ b/src/pages/docs/evaluation/builtin/is-xml.mdx @@ -0,0 +1,50 @@ +--- +title: "Is Xml: Built-in Evaluation" +description: "Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document" +--- + +Validates if text is well-formed XML. Checks that the text can be parsed as a valid XML document. + + + +```python Python +result = evaluator.evaluate( + eval_templates="is_xml", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "is_xml", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Code` diff --git a/src/pages/docs/evaluation/builtin/jaccard-similarity.mdx b/src/pages/docs/evaluation/builtin/jaccard-similarity.mdx new file mode 100644 index 00000000..7b7363a4 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/jaccard-similarity.mdx @@ -0,0 +1,53 @@ +--- +title: "Jaccard Similarity: Built-in Evaluation" +description: "Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level overlap regardless of frequency or order" +--- + +Computes Jaccard similarity (intersection over union) between token sets of two texts. Useful for measuring set-level overlap regardless of frequency or order. + + + +```python Python +result = evaluator.evaluate( + eval_templates="jaccard_similarity", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "jaccard_similarity", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. 
| +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/jaro-winkler-similarity.mdx b/src/pages/docs/evaluation/builtin/jaro-winkler-similarity.mdx new file mode 100644 index 00000000..27826340 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/jaro-winkler-similarity.mdx @@ -0,0 +1,53 @@ +--- +title: "Jaro Winkler Similarity: Built-in Evaluation" +description: "Computes Jaro-Winkler similarity between two strings. Particularly effective for short strings like names, labels, and identifiers. Adds a prefix bonus to th..." +--- + +Computes Jaro-Winkler similarity between two strings. Particularly effective for short strings like names, labels, and identifiers. Adds a prefix bonus to the base Jaro distance. + + + +```python Python +result = evaluator.evaluate( + eval_templates="jaro_winkler_similarity", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "jaro_winkler_similarity", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/json-diff.mdx b/src/pages/docs/evaluation/builtin/json-diff.mdx new file mode 100644 index 00000000..2ad9ce6d --- /dev/null +++ b/src/pages/docs/evaluation/builtin/json-diff.mdx @@ -0,0 +1,53 @@ +--- +title: "Json Diff: Built-in Evaluation" +description: "Deep structural comparison between two JSON objects. Recursively compares keys and values at all levels, returning a score based on the fraction of matching ..." +--- + +Deep structural comparison between two JSON objects. Recursively compares keys and values at all levels, returning a score based on the fraction of matching nodes. Useful for evaluating structured output generation. + + + +```python Python +result = evaluator.evaluate( + eval_templates="json_diff", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "json_diff", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. 
| +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx b/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx new file mode 100644 index 00000000..fe068292 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/json-scheme-validation.mdx @@ -0,0 +1,53 @@ +--- +title: "Json Scheme Validation: Built-in Evaluation" +description: "Validates JSON against specified criteria" +--- + +Validates JSON against specified criteria. + + + +```python Python +result = evaluator.evaluate( + eval_templates="json_scheme_validation", + inputs={ + "actual_json": "...", + "expected_json": "..." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "json_scheme_validation", + { + actual_json: "...", + expected_json: "..." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `actual_json` | `string` | The actual json. | +| | `expected_json` | `string` | The expected json. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Format`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/latency-check.mdx b/src/pages/docs/evaluation/builtin/latency-check.mdx new file mode 100644 index 00000000..1b464bf7 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/latency-check.mdx @@ -0,0 +1,50 @@ +--- +title: "Latency Check: Built-in Evaluation" +description: "Validates that response latency is within acceptable bounds. Pass if `latency <= max_latency_ms`" +--- + +Validates that response latency is within acceptable bounds. Pass if `latency <= max_latency_ms`. + + + +```python Python +result = evaluator.evaluate( + eval_templates="latency_check", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "latency_check", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/length-between.mdx b/src/pages/docs/evaluation/builtin/length-between.mdx new file mode 100644 index 00000000..5799bcfa --- /dev/null +++ b/src/pages/docs/evaluation/builtin/length-between.mdx @@ -0,0 +1,50 @@ +--- +title: "Length Between: Built-in Evaluation" +description: "Checks if the text length is between specified min and max values" +--- + +Checks if the text length is between specified min and max values. 
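+
+Conceptually this is an inclusive range check on the text's length (character count, since word-level limits are covered by the separate Word Count In Range template). A standalone illustration of the idea, not the SDK's implementation (boundary handling may differ):
+
+```python Python
+def length_between(text: str, min_len: int, max_len: int) -> bool:
+    # Pass when the length falls within [min_len, max_len].
+    return min_len <= len(text) <= max_len
+
+print(length_between("Hello, this is a sample text.", 10, 100))  # True (29 characters)
+```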
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="length_between", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "length_between", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/length-greater-than.mdx b/src/pages/docs/evaluation/builtin/length-greater-than.mdx new file mode 100644 index 00000000..b86902fc --- /dev/null +++ b/src/pages/docs/evaluation/builtin/length-greater-than.mdx @@ -0,0 +1,50 @@ +--- +title: "Length Greater Than: Built-in Evaluation" +description: "Checks if the text length is greater than a specified value" +--- + +Checks if the text length is greater than a specified value. + + + +```python Python +result = evaluator.evaluate( + eval_templates="length_greater_than", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "length_greater_than", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/length-less-than.mdx b/src/pages/docs/evaluation/builtin/length-less-than.mdx new file mode 100644 index 00000000..8a45fbe0 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/length-less-than.mdx @@ -0,0 +1,50 @@ +--- +title: "Length Less Than: Built-in Evaluation" +description: "Checks if text length is below threshold" +--- + +Checks if text length is below threshold. + + + +```python Python +result = evaluator.evaluate( + eval_templates="length_less_than", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "length_less_than", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. 
| + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/log-loss.mdx b/src/pages/docs/evaluation/builtin/log-loss.mdx new file mode 100644 index 00000000..27a9d836 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/log-loss.mdx @@ -0,0 +1,53 @@ +--- +title: "Log Loss: Built-in Evaluation" +description: "Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score" +--- + +Computes log loss (cross-entropy) for probability predictions. Returns 1/(1+loss) as score. Lower loss = higher score. + + + +```python Python +result = evaluator.evaluate( + eval_templates="log_loss", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "log_loss", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/match-error-rate.mdx b/src/pages/docs/evaluation/builtin/match-error-rate.mdx new file mode 100644 index 00000000..0ba0467a --- /dev/null +++ b/src/pages/docs/evaluation/builtin/match-error-rate.mdx @@ -0,0 +1,53 @@ +--- +title: "Match Error Rate: Built-in Evaluation" +description: "Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score" +--- + +Computes Match Error Rate (MER) for speech recognition. MER = edits / (hits + edits). Returns 1-MER as score. + + + +```python Python +result = evaluator.evaluate( + eval_templates="match_error_rate", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "match_error_rate", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `NLP Metrics`, `Audio` diff --git a/src/pages/docs/evaluation/builtin/matthews-correlation.mdx b/src/pages/docs/evaluation/builtin/matthews-correlation.mdx new file mode 100644 index 00000000..f874bbac --- /dev/null +++ b/src/pages/docs/evaluation/builtin/matthews-correlation.mdx @@ -0,0 +1,53 @@ +--- +title: "Matthews Correlation: Built-in Evaluation" +description: "Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that produces high scores only when the predictio..." +--- + +Computes Matthews Correlation Coefficient (MCC). A balanced metric for binary and multiclass classification that produces high scores only when the prediction obtains good results in all four confusion matrix categories. Range -1 to 1, normalized to 0-1. + + + +```python Python +result = evaluator.evaluate( + eval_templates="matthews_correlation", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "matthews_correlation", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/mean-average-precision.mdx b/src/pages/docs/evaluation/builtin/mean-average-precision.mdx new file mode 100644 index 00000000..62442e6d --- /dev/null +++ b/src/pages/docs/evaluation/builtin/mean-average-precision.mdx @@ -0,0 +1,53 @@ +--- +title: "Mean Average Precision: Built-in Evaluation" +description: "Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries" +--- + +Computes Mean Average Precision (MAP) for information retrieval. Averages precision at each relevant item across queries. + + + +```python Python +result = evaluator.evaluate( + eval_templates="mean_average_precision", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "mean_average_precision", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. 
| +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `RAG`, `Retrieval Systems`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/meteor-score.mdx b/src/pages/docs/evaluation/builtin/meteor-score.mdx new file mode 100644 index 00000000..cbcbbd8c --- /dev/null +++ b/src/pages/docs/evaluation/builtin/meteor-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Meteor Score: Built-in Evaluation" +description: "Computes METEOR score between reference and hypothesis. Uses unigram matching with exact and stem matching, penalizing fragmentation. More correlated with hu..." +--- + +Computes METEOR score between reference and hypothesis. Uses unigram matching with exact and stem matching, penalizing fragmentation. More correlated with human judgment than BLEU for many tasks. + + + +```python Python +result = evaluator.evaluate( + eval_templates="meteor_score", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "meteor_score", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/non-llm-context-precision.mdx b/src/pages/docs/evaluation/builtin/non-llm-context-precision.mdx new file mode 100644 index 00000000..254d4d0f --- /dev/null +++ b/src/pages/docs/evaluation/builtin/non-llm-context-precision.mdx @@ -0,0 +1,53 @@ +--- +title: "Non Llm Context Precision: Built-in Evaluation" +description: "Non-LLM context precision for RAG evaluation. Measures what fraction of retrieved contexts match reference contexts using exact string matching" +--- + +Non-LLM context precision for RAG evaluation. Measures what fraction of retrieved contexts match reference contexts using exact string matching. + + + +```python Python +result = evaluator.evaluate( + eval_templates="non_llm_context_precision", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "non_llm_context_precision", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. 
| + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `RAG`, `Retrieval Systems` diff --git a/src/pages/docs/evaluation/builtin/non-llm-context-recall.mdx b/src/pages/docs/evaluation/builtin/non-llm-context-recall.mdx new file mode 100644 index 00000000..8ab30f19 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/non-llm-context-recall.mdx @@ -0,0 +1,53 @@ +--- +title: "Non Llm Context Recall: Built-in Evaluation" +description: "Non-LLM context recall for RAG evaluation. Measures what fraction of reference contexts were successfully retrieved" +--- + +Non-LLM context recall for RAG evaluation. Measures what fraction of reference contexts were successfully retrieved. + + + +```python Python +result = evaluator.evaluate( + eval_templates="non_llm_context_recall", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "non_llm_context_recall", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `RAG`, `Retrieval Systems` diff --git a/src/pages/docs/evaluation/builtin/one-line.mdx b/src/pages/docs/evaluation/builtin/one-line.mdx new file mode 100644 index 00000000..b8393d09 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/one-line.mdx @@ -0,0 +1,50 @@ +--- +title: "One Line: Built-in Evaluation" +description: "Checks if the text is a single line" +--- + +Checks if the text is a single line. + + + +```python Python +result = evaluator.evaluate( + eval_templates="one_line", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "one_line", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `Output Validation`, `Output Format` diff --git a/src/pages/docs/evaluation/builtin/pearson-correlation.mdx b/src/pages/docs/evaluation/builtin/pearson-correlation.mdx new file mode 100644 index 00000000..f90a230d --- /dev/null +++ b/src/pages/docs/evaluation/builtin/pearson-correlation.mdx @@ -0,0 +1,53 @@ +--- +title: "Pearson Correlation: Built-in Evaluation" +description: "Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized to 0-1)" +--- + +Computes Pearson correlation coefficient between two sets of numeric values. Measures linear relationship strength (-1 to 1, normalized to 0-1). + + + +```python Python +result = evaluator.evaluate( + eval_templates="pearson_correlation", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "pearson_correlation", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/precision-score.mdx b/src/pages/docs/evaluation/builtin/precision-score.mdx new file mode 100644 index 00000000..749eddc9 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/precision-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Precision Score: Built-in Evaluation" +description: "Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions are actually correct" +--- + +Computes classification precision (TP / (TP + FP)) for binary or multiclass tasks. Measures how many positive predictions are actually correct. + + + +```python Python +result = evaluator.evaluate( + eval_templates="precision_score", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "precision_score", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/prompt-adherence.mdx b/src/pages/docs/evaluation/builtin/prompt-adherence.mdx new file mode 100644 index 00000000..9c92d9c3 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/prompt-adherence.mdx @@ -0,0 +1,57 @@ +--- +title: "Prompt Adherence: Built-in Evaluation" +description: "Assesses how closely the output follows the given prompt instructions, checking for completion of all requested tasks and adherence to specified constraints ..." +--- + +Assesses how closely the output follows the given prompt instructions, checking for completion of all requested tasks and adherence to specified constraints or formats. Evaluates both explicit and implicit requirements in the prompt. + + + +```python Python +result = evaluator.evaluate( + eval_templates="prompt_adherence", + inputs={ + "input": "What is the capital of France?", + "output": "The capital of France is Paris." + }, + model_name="turing_flash" +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "prompt_adherence", + { + input: "What is the capital of France?", + output: "The capital of France is Paris." + }, + { + modelName: "turing_flash", + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `input` | `string` | The input. | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/psnr.mdx b/src/pages/docs/evaluation/builtin/psnr.mdx new file mode 100644 index 00000000..a5596307 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/psnr.mdx @@ -0,0 +1,53 @@ +--- +title: "Psnr: Built-in Evaluation" +description: "Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns a normalized score (0-1) where PSNR is mapp..." +--- + +Computes Peak Signal-to-Noise Ratio (PSNR) between two images. Higher PSNR indicates more similar images. Returns a normalized score (0-1) where PSNR is mapped from 0-50 dB range. + + + +```python Python +result = evaluator.evaluate( + eval_templates="psnr", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "psnr", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. 
| +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Image`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/r2-score.mdx b/src/pages/docs/evaluation/builtin/r2-score.mdx new file mode 100644 index 00000000..d1199be6 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/r2-score.mdx @@ -0,0 +1,53 @@ +--- +title: "R2 Score: Built-in Evaluation" +description: "Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions" +--- + +Computes R-squared (coefficient of determination). Measures proportion of variance explained by predictions. + + + +```python Python +result = evaluator.evaluate( + eval_templates="r2_score", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "r2_score", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/readability-score.mdx b/src/pages/docs/evaluation/builtin/readability-score.mdx new file mode 100644 index 00000000..437115d3 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/readability-score.mdx @@ -0,0 +1,50 @@ +--- +title: "Readability Score: Built-in Evaluation" +description: "Computes Flesch-Kincaid readability metrics. Returns a normalized score (0-1) based on Flesch Reading Ease. Higher scores indicate more readable text. Also r..." +--- + +Computes Flesch-Kincaid readability metrics. Returns a normalized score (0-1) based on Flesch Reading Ease. Higher scores indicate more readable text. Also reports grade level. + + + +```python Python +result = evaluator.evaluate( + eval_templates="readability_score", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "readability_score", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/recall-score.mdx b/src/pages/docs/evaluation/builtin/recall-score.mdx new file mode 100644 index 00000000..a46f5dbb --- /dev/null +++ b/src/pages/docs/evaluation/builtin/recall-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Recall Score: Built-in Evaluation" +description: "Recall: Out of all ground-truth relevant chunks, what fraction was retrieved" +--- + +Recall: Out of all ground-truth relevant chunks, what fraction was retrieved. + + + +```python Python +result = evaluator.evaluate( + eval_templates="recall_score", + inputs={ + "hypothesis": "Paris is the capital of France.", + "reference": "The capital of France is Paris." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "recall_score", + { + hypothesis: "Paris is the capital of France.", + reference: "The capital of France is Paris." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `hypothesis` | `string` | The hypothesis. | +| | `reference` | `string` | The reference. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/regex-pii-detection.mdx b/src/pages/docs/evaluation/builtin/regex-pii-detection.mdx new file mode 100644 index 00000000..3566231d --- /dev/null +++ b/src/pages/docs/evaluation/builtin/regex-pii-detection.mdx @@ -0,0 +1,50 @@ +--- +title: "Regex Pii Detection: Built-in Evaluation" +description: "Detects Personally Identifiable Information (PII) using regex patterns. Scans for SSN, credit card numbers, phone numbers, email addresses, and IP addresses...." +--- + +Detects Personally Identifiable Information (PII) using regex patterns. Scans for SSN, credit card numbers, phone numbers, email addresses, and IP addresses. Returns pass (no PII) or fail (PII detected). + + + +```python Python +result = evaluator.evaluate( + eval_templates="regex_pii_detection", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "regex_pii_detection", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. 
| + +**Tags:** `Data Leakage`, `Safety` diff --git a/src/pages/docs/evaluation/builtin/regex.mdx b/src/pages/docs/evaluation/builtin/regex.mdx new file mode 100644 index 00000000..d975cc96 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/regex.mdx @@ -0,0 +1,50 @@ +--- +title: "Regex: Built-in Evaluation" +description: "Checks if the text matches a specified regex pattern" +--- + +Checks if the text matches a specified regex pattern. + + + +```python Python +result = evaluator.evaluate( + eval_templates="regex", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "regex", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Output Format` diff --git a/src/pages/docs/evaluation/builtin/repetition-rate.mdx b/src/pages/docs/evaluation/builtin/repetition-rate.mdx new file mode 100644 index 00000000..985eb227 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/repetition-rate.mdx @@ -0,0 +1,50 @@ +--- +title: "Repetition Rate: Built-in Evaluation" +description: "Measures repeated n-gram rate in text. Returns 1-rate as score (higher = less repetitive = better). Useful for detecting degenerate/looping LLM outputs" +--- + +Measures repeated n-gram rate in text. Returns 1-rate as score (higher = less repetitive = better). Useful for detecting degenerate/looping LLM outputs. + + + +```python Python +result = evaluator.evaluate( + eval_templates="repetition_rate", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "repetition_rate", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/rmse.mdx b/src/pages/docs/evaluation/builtin/rmse.mdx new file mode 100644 index 00000000..1348ed97 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/rmse.mdx @@ -0,0 +1,53 @@ +--- +title: "Rmse: Built-in Evaluation" +description: "Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better)" +--- + +Computes Root Mean Squared Error between predicted and actual values. Returns 1/(1+RMSE) as score (higher=better). 
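+
+For intuition, the arithmetic behind that mapping on two short numeric series (a standalone sketch of the formula above, not the platform's internal implementation; how the template expects the series to be encoded is not shown here):
+
+```python Python
+import math
+
+predicted = [3.1, 2.9, 4.2]
+actual = [3.0, 3.0, 4.0]
+
+# Root mean squared error between predictions and ground truth
+rmse = math.sqrt(sum((p - a) ** 2 for p, a in zip(predicted, actual)) / len(actual))
+
+# Score as described above: 1 / (1 + RMSE), so zero error maps to 1.0
+score = 1 / (1 + rmse)
+print(round(rmse, 3), round(score, 3))  # 0.141 0.876
+```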
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="rmse", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "rmse", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/sentence-count.mdx b/src/pages/docs/evaluation/builtin/sentence-count.mdx new file mode 100644 index 00000000..e3bca752 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/sentence-count.mdx @@ -0,0 +1,50 @@ +--- +title: "Sentence Count: Built-in Evaluation" +description: "Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints on generated text" +--- + +Counts sentences in text and optionally validates against a min/max range. Useful for enforcing structural constraints on generated text. + + + +```python Python +result = evaluator.evaluate( + eval_templates="sentence_count", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "sentence_count", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/spearman-correlation.mdx b/src/pages/docs/evaluation/builtin/spearman-correlation.mdx new file mode 100644 index 00000000..00c1e8f7 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/spearman-correlation.mdx @@ -0,0 +1,53 @@ +--- +title: "Spearman Correlation: Built-in Evaluation" +description: "Computes Spearman rank correlation coefficient. Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1)" +--- + +Computes Spearman rank correlation coefficient. Measures monotonic relationship between two sets of values (-1 to 1, normalized to 0-1). 
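+
+For intuition, a standalone sketch of the metric using SciPy. The rescaling from [-1, 1] to [0, 1] shown here is the straightforward linear mapping `(rho + 1) / 2`; treat it as an assumption about the normalization, not a statement of the platform's exact formula:
+
+```python Python
+from scipy.stats import spearmanr
+
+predicted = [1.0, 2.0, 3.0, 4.0]
+expected = [10, 20, 25, 40]
+
+# Spearman rank correlation is in [-1, 1]; perfectly monotonic data gives 1.0
+rho, _ = spearmanr(predicted, expected)
+
+# Assumed linear rescaling to [0, 1]
+normalized = (rho + 1) / 2
+print(rho, normalized)  # 1.0 1.0
+```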
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="spearman_correlation", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "spearman_correlation", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/squad-score.mdx b/src/pages/docs/evaluation/builtin/squad-score.mdx new file mode 100644 index 00000000..1eb61a8c --- /dev/null +++ b/src/pages/docs/evaluation/builtin/squad-score.mdx @@ -0,0 +1,53 @@ +--- +title: "Squad Score: Built-in Evaluation" +description: "Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before comparing" +--- + +Computes SQuAD-style evaluation: exact match + token F1 for QA tasks. Normalizes text (lowercase, remove articles/punctuation) before comparing. + + + +```python Python +result = evaluator.evaluate( + eval_templates="squad_score", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "squad_score", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `RAG` diff --git a/src/pages/docs/evaluation/builtin/ssim.mdx b/src/pages/docs/evaluation/builtin/ssim.mdx new file mode 100644 index 00000000..fa340dea --- /dev/null +++ b/src/pages/docs/evaluation/builtin/ssim.mdx @@ -0,0 +1,53 @@ +--- +title: "Ssim: Built-in Evaluation" +description: "Computes Structural Similarity Index (SSIM) between two images. Measures perceptual similarity based on luminance, contrast, and structure. Score ranges from..." +--- + +Computes Structural Similarity Index (SSIM) between two images. Measures perceptual similarity based on luminance, contrast, and structure. Score ranges from 0 (completely different) to 1 (identical). 
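+
+For intuition, a standalone sketch of the metric itself using scikit-image (illustration only; this is not necessarily the library the platform uses, and it does not show how the template expects images to be supplied):
+
+```python Python
+import numpy as np
+from skimage.metrics import structural_similarity
+
+rng = np.random.default_rng(0)
+image_a = rng.random((64, 64))
+# A slightly noisy copy of the same image scores close to, but below, 1.0
+image_b = np.clip(image_a + rng.normal(scale=0.05, size=image_a.shape), 0.0, 1.0)
+
+score = structural_similarity(image_a, image_b, data_range=1.0)
+print(round(score, 3))
+```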
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="ssim", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "ssim", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Image`, `NLP Metrics` diff --git a/src/pages/docs/evaluation/builtin/starts-with.mdx b/src/pages/docs/evaluation/builtin/starts-with.mdx new file mode 100644 index 00000000..0c345191 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/starts-with.mdx @@ -0,0 +1,50 @@ +--- +title: "Starts With: Built-in Evaluation" +description: "Checks if text begins with specific substring" +--- + +Checks if text begins with specific substring. + + + +```python Python +result = evaluator.evaluate( + eval_templates="starts_with", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "starts_with", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/step-count.mdx b/src/pages/docs/evaluation/builtin/step-count.mdx new file mode 100644 index 00000000..f16058cd --- /dev/null +++ b/src/pages/docs/evaluation/builtin/step-count.mdx @@ -0,0 +1,50 @@ +--- +title: "Step Count: Built-in Evaluation" +description: "Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, maximum, or range" +--- + +Counts and validates the number of steps/actions in an agent trajectory. Can check against exact count, minimum, maximum, or range. + + + +```python Python +result = evaluator.evaluate( + eval_templates="step_count", + inputs={ + "output": "The capital of France is Paris." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "step_count", + { + output: "The capital of France is Paris." 
+ } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Agents`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/syntax-validation.mdx b/src/pages/docs/evaluation/builtin/syntax-validation.mdx new file mode 100644 index 00000000..ecad73bc --- /dev/null +++ b/src/pages/docs/evaluation/builtin/syntax-validation.mdx @@ -0,0 +1,50 @@ +--- +title: "Syntax Validation: Built-in Evaluation" +description: "Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket validation. Useful for checking if LLM-genera..." +--- + +Validates code syntax without executing it. Supports Python (via ast.parse), JSON, and basic JavaScript bracket validation. Useful for checking if LLM-generated code is syntactically correct. + + + +```python Python +result = evaluator.evaluate( + eval_templates="syntax_validation", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "syntax_validation", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Code`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/tool-call-accuracy.mdx b/src/pages/docs/evaluation/builtin/tool-call-accuracy.mdx new file mode 100644 index 00000000..2618677f --- /dev/null +++ b/src/pages/docs/evaluation/builtin/tool-call-accuracy.mdx @@ -0,0 +1,53 @@ +--- +title: "Tool Call Accuracy: Built-in Evaluation" +description: "Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and arguments, supporting both OpenAI and generi..." +--- + +Evaluates accuracy of agent tool/function calls by comparing actual vs expected calls. Checks function names and arguments, supporting both OpenAI and generic tool call formats. Scores full matches (name+args) at 1.0 and name-only matches at 0.5. 
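+
+For intuition, a minimal sketch of the scoring rule described above (full match 1.0, name-only match 0.5). The dictionary shape is illustrative; it is not the exact structure the template parses:
+
+```python Python
+# Scoring rule from the description: matching name and arguments scores 1.0,
+# matching name with different arguments scores 0.5, anything else 0.0.
+def score_call(actual: dict, expected: dict) -> float:
+    if actual["name"] != expected["name"]:
+        return 0.0
+    return 1.0 if actual["arguments"] == expected["arguments"] else 0.5
+
+actual = {"name": "get_weather", "arguments": {"city": "Paris"}}
+expected = {"name": "get_weather", "arguments": {"city": "Paris", "unit": "celsius"}}
+print(score_call(actual, expected))  # 0.5: right function, mismatched arguments
+```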
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="tool_call_accuracy", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "tool_call_accuracy", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Agents`, `Code` diff --git a/src/pages/docs/evaluation/builtin/trajectory-match.mdx b/src/pages/docs/evaluation/builtin/trajectory-match.mdx new file mode 100644 index 00000000..2b49b349 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/trajectory-match.mdx @@ -0,0 +1,53 @@ +--- +title: "Trajectory Match: Built-in Evaluation" +description: "Validates agent action/tool call sequences. Supports strict (same order), unordered (any order), subset (expected in actual), and superset (actual in expecte..." +--- + +Validates agent action/tool call sequences. Supports strict (same order), unordered (any order), subset (expected in actual), and superset (actual in expected) matching modes. + + + +```python Python +result = evaluator.evaluate( + eval_templates="trajectory_match", + inputs={ + "output": "The capital of France is Paris.", + "expected": "Paris" + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "trajectory_match", + { + output: "The capital of France is Paris.", + expected: "Paris" + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `output` | `string` | The output. | +| | `expected` | `string` | The expected. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Agents`, `Output Validation` diff --git a/src/pages/docs/evaluation/builtin/translation-edit-rate.mdx b/src/pages/docs/evaluation/builtin/translation-edit-rate.mdx new file mode 100644 index 00000000..ab05f0af --- /dev/null +++ b/src/pages/docs/evaluation/builtin/translation-edit-rate.mdx @@ -0,0 +1,53 @@ +--- +title: "Translation Edit Rate: Built-in Evaluation" +description: "Computes Translation Edit Rate (TER). TER measures the minimum number of edits (insertions, deletions, substitutions) needed to transform the hypothesis into..." +--- + +Computes Translation Edit Rate (TER). TER measures the minimum number of edits (insertions, deletions, substitutions) needed to transform the hypothesis into the reference, normalized by reference length. Returns 1-TER (higher=better). 
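+
+For intuition, the arithmetic behind that mapping with illustrative numbers:
+
+```python Python
+# Worked example of the score described above: score = 1 - TER,
+# where TER = edits needed / reference length.
+reference_length = 6   # words in the reference translation
+edits_needed = 2       # insertions + deletions + substitutions
+
+ter = edits_needed / reference_length
+score = 1 - ter
+print(round(ter, 3), round(score, 3))  # 0.333 0.667
+```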
+ + + +```python Python +result = evaluator.evaluate( + eval_templates="translation_edit_rate", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "translation_edit_rate", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/type-token-ratio.mdx b/src/pages/docs/evaluation/builtin/type-token-ratio.mdx new file mode 100644 index 00000000..4c33b19c --- /dev/null +++ b/src/pages/docs/evaluation/builtin/type-token-ratio.mdx @@ -0,0 +1,50 @@ +--- +title: "Type Token Ratio: Built-in Evaluation" +description: "Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. Measures lexical diversity" +--- + +Computes Type-Token Ratio (TTR): unique tokens divided by total tokens. Measures lexical diversity. + + + +```python Python +result = evaluator.evaluate( + eval_templates="type_token_ratio", + inputs={ + "text": "Hello, this is a sample text." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "type_token_ratio", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Text` diff --git a/src/pages/docs/evaluation/builtin/word-count-in-range.mdx b/src/pages/docs/evaluation/builtin/word-count-in-range.mdx new file mode 100644 index 00000000..07f69af2 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/word-count-in-range.mdx @@ -0,0 +1,50 @@ +--- +title: "Word Count In Range: Built-in Evaluation" +description: "Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated responses (e.g., summaries, tweets, abs..." +--- + +Checks if the word count of text falls within a specified range. Useful for enforcing length constraints on generated responses (e.g., summaries, tweets, abstracts). + + + +```python Python +result = evaluator.evaluate( + eval_templates="word_count_in_range", + inputs={ + "text": "Hello, this is a sample text." 
+ }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "word_count_in_range", + { + text: "Hello, this is a sample text." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `text` | `string` | The text. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `Output Validation`, `Text` diff --git a/src/pages/docs/evaluation/builtin/word-error-rate.mdx b/src/pages/docs/evaluation/builtin/word-error-rate.mdx new file mode 100644 index 00000000..c36589c5 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/word-error-rate.mdx @@ -0,0 +1,53 @@ +--- +title: "Word Error Rate: Built-in Evaluation" +description: "Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between reference and hypothesis transcriptions. Retu..." +--- + +Computes Word Error Rate (WER) for ASR/STT evaluation. WER measures the edit distance at the word level between reference and hypothesis transcriptions. Returns 1-WER as score (higher=better). + + + +```python Python +result = evaluator.evaluate( + eval_templates="word_error_rate", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "word_error_rate", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Audio` diff --git a/src/pages/docs/evaluation/builtin/word-info-lost.mdx b/src/pages/docs/evaluation/builtin/word-info-lost.mdx new file mode 100644 index 00000000..4db58f19 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/word-info-lost.mdx @@ -0,0 +1,53 @@ +--- +title: "Word Info Lost: Built-in Evaluation" +description: "Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score" +--- + +Computes Word Information Lost (WIL) for speech. WIL = 1 - (hits/ref * hits/hyp). Returns 1-WIL as score. + + + +```python Python +result = evaluator.evaluate( + eval_templates="word_info_lost", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." 
+ }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "word_info_lost", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Audio` diff --git a/src/pages/docs/evaluation/builtin/word-info-preserved.mdx b/src/pages/docs/evaluation/builtin/word-info-preserved.mdx new file mode 100644 index 00000000..365edb65 --- /dev/null +++ b/src/pages/docs/evaluation/builtin/word-info-preserved.mdx @@ -0,0 +1,53 @@ +--- +title: "Word Info Preserved: Built-in Evaluation" +description: "Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better" +--- + +Computes Word Information Preserved (WIP) for speech. WIP = (hits/ref) * (hits/hyp). Higher = better. + + + +```python Python +result = evaluator.evaluate( + eval_templates="word_info_preserved", + inputs={ + "reference": "The capital of France is Paris.", + "hypothesis": "Paris is the capital of France." + }, +) + +print(result.eval_results[0].output) +print(result.eval_results[0].reason) +``` + +```typescript JS/TS +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "word_info_preserved", + { + reference: "The capital of France is Paris.", + hypothesis: "Paris is the capital of France." + } +); + +console.log(result); +``` + + + +| **Input** | | | | +| ------ | --------- | ---- | ----------- | +| | **Required Input** | **Type** | **Description** | +| | `reference` | `string` | The reference. | +| | `hypothesis` | `string` | The hypothesis. | + +| **Output** | | | +| ------ | ----- | ----------- | +| | **Field** | **Description** | +| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. | +| | **Reason** | A plain-language explanation of the verdict. | + +**Tags:** `NLP Metrics`, `Audio` diff --git a/src/pages/docs/evaluation/concepts/composite-evals.mdx b/src/pages/docs/evaluation/concepts/composite-evals.mdx new file mode 100644 index 00000000..a0dc2b08 --- /dev/null +++ b/src/pages/docs/evaluation/concepts/composite-evals.mdx @@ -0,0 +1,116 @@ +--- +title: "Composite Evals: Combining Multiple Checks Into One Score" +description: "How to bundle several eval templates into a single composite, the five aggregation functions, and when to use a safety gate vs an average." +--- + +## About + +A composite eval runs several child eval templates against the same row and combines them into one verdict. Use it when "good" is a combination of independent checks: an agent response should be helpful **and** factually grounded **and** polite, and you want one number that summarises all three. + +A composite is just a template like any other. You apply it to a dataset, trace project, or simulation the same way. 
The only difference is what's inside. + +--- + +## Anatomy of a composite + +| Setting | What it controls | +|---|---| +| **Children** | The list of eval templates to run for each row. | +| **Child axis** | The output type all children must share: `Pass/fail`, `Scoring`, or `Choices`. Locked once the first child is added so children stay comparable. | +| **Aggregation** | Toggle: when on, the composite produces a single combined score. When off, it just runs each child independently and reports them side by side. | +| **Aggregation function** | When aggregation is on, how to combine: `Weighted Average`, `Average`, `Minimum (safety gate)`, `Maximum`, or `Pass Rate`. | +| **Weight per child** | A numeric weight used by `Weighted Average`. | + +--- + +## The five aggregation functions + +Each function turns N child scores into one composite score. Pick the one that matches what "the row passed" should mean. + +| Function | What it does | When to use | +|---|---|---| +| **Weighted Average** | `sum(score × weight) / sum(weights)` for each child. | When some checks are more important than others. | +| **Average** | Simple mean of all child scores. | When every check counts equally. | +| **Minimum (safety gate)** | Composite score equals the lowest child score. | When any single failure should fail the whole row. | +| **Maximum** | Composite score equals the highest child score. | Rare. When you only need at least one check to pass. | +| **Pass Rate** | Fraction of children that met their own pass threshold. | When you want a percentage like "this row passed 4 of 5 checks." | + +### When to pick Weighted Average + +Use this when checks are not equally important. For example, a customer-support quality composite might weight `Helpfulness` and `Factually Grounded` heavily, and `Tone` lightly: + +``` +Helpfulness: weight 3 +Factually Grounded: weight 3 +Tone: weight 1 +``` + +A row that fails on Tone but passes Helpfulness and Grounded still scores well; a row that fails on Helpfulness drops sharply. + +### When to pick Minimum + +Use this when a single failure should fail the row. For example, a safety composite where any unsafe content rules the row out: + +``` +Toxicity: Pass/fail +PII Detection: Pass/fail +Prompt Injection: Pass/fail +``` + +If the toxicity check fails, the composite fails, regardless of the others. This is the "safety gate" pattern. + +### When to pick Pass Rate + +Use this when you want a percentage like "this row passed 4 of 5 checks" rather than a single composite score. The result is `(children that passed) / (total children)`. This stays meaningful as you add or remove children. + +--- + +## The child axis + +All children of a composite must share the same output type so their results can be compared. The **Child axis** setting picks that shared type: + +- **Pass/fail axis:** All children must be `Pass/fail` templates. Aggregation produces a pass count or pass rate. +- **Scoring axis:** All children must be `Scoring` templates. Aggregation works on the numeric scores. +- **Choices axis:** All children must be `Choices` templates with the same label set. + +The axis is locked once you add the first child. To change it, remove all children first. + +--- + +## Aggregation off vs on + +The Aggregation toggle decides whether the composite produces one number or just a panel of independent results. + +- **Aggregation on:** The composite returns one verdict per row, computed from the children using the chosen function. This is the typical setup. 
+- **Aggregation off:** The composite runs each child and surfaces them all individually. Useful when you want to bundle related checks for convenience (one click to apply, instead of N clicks) without forcing a combined score. + +--- + +## When to use a composite + +- The judgment has multiple independent dimensions that don't fit in one prompt +- A safety gate where any failure means the row fails +- A pass-rate metric across a fixed battery of checks +- You want to apply the same set of evals everywhere with one click + +When **not** to use one: + +- The dimensions are correlated and a single LLM-As-A-Judge eval with a richer rubric can decide. Composite costs N evals per row; one well-written eval costs one. +- The children don't share an output type. Composite requires a single axis. + +--- + +## Pinned versions + +Each child can be pinned to a specific [version](/docs/evaluation/concepts/eval-templates#versioning) of its template. This keeps the composite stable when child templates change. New versions of a child won't affect the composite until you explicitly update the pin. + +This is the right default for production composites where you want the behaviour locked. + +--- + +## Next Steps + +- [Eval templates](/docs/evaluation/concepts/eval-templates): The composite vs single distinction. +- [Output types](/docs/evaluation/concepts/output-types): The output type sets the child axis. +- [Eval results](/docs/evaluation/concepts/eval-results): How composite results aggregate. +- [Create custom evals](/docs/evaluation/features/custom): Build your own composite. diff --git a/src/pages/docs/evaluation/concepts/data-injection.mdx b/src/pages/docs/evaluation/concepts/data-injection.mdx new file mode 100644 index 00000000..df3037df --- /dev/null +++ b/src/pages/docs/evaluation/concepts/data-injection.mdx @@ -0,0 +1,125 @@ +--- +title: "Data Injection: Giving an Eval More Than Variables" +description: "How the Context setting lets an eval see the dataset row, span attributes, the full trace tree, the conversation history, or the call transcript in addition to your mapped variables." +--- + +## About + +By default an eval sees only the `{{variables}}` you mapped. That's the right starting point: it keeps the eval focused, predictable, and cheap to run. + +But sometimes the judgment needs more. If you're checking whether an agent answered a customer's question correctly, the judge might need the full conversation, not just the last reply. If you're scoring a span in a trace, the judge might need to see the parent span's input. The **Context** setting on a template controls how much extra data the eval gets in addition to your variables. + +This setting applies to [Agent evals](/docs/evaluation/concepts/eval-types#agents) and is a major reason to pick Agents over LLM-As-A-Judge. + +--- + +## The six context options + +The Context selector on the eval create page offers six options. You can pick any combination. + +| Option | What gets injected | +|---|---| +| **Template variables** | Only the `{{variables}}` you mapped. This is the default. | +| **Dataset row context** | All columns from the current row. | +| **Call context** | The call transcript, recording, and scenario. Used when scoring a simulation call. | +| **Full span context** | The complete span data and metadata. Used when scoring a span in a trace. | +| **Trace context** | The full trace tree with every span. | +| **Session context** | The full conversation history across multiple traces. | + +`Template variables` is always on. 
The other five are additive: turning on `Dataset row context` doesn't replace your mapped variables, it adds the rest of the row alongside them. + +{/* SCREENSHOT NEEDED: The Context dropdown on the Agents tab of the eval create page, opened to show all six options (Template variables checked by default, plus the five additional options). */} + +--- + +## When to use each + +### Template variables (default) + +Pick this for any check that fits in a small number of clearly named inputs. It's the right default because: + +- The eval is cheaper (less to read). +- The judgment is reproducible (same inputs every time). +- You can swap the data source (dataset → trace → simulation) by changing only the mapping. + +If you can answer "what does the eval need?" with three or four field names, leave Context on `Template variables`. + +### Dataset row context + +Use when the row has dozens of columns and the judge needs to see them all to decide. For example, a compliance check that has to consider customer tier, region, channel, and product type alongside the response. + +This is also the right pick when the same eval needs to work across datasets that have different column structures: the judge gets everything and decides what's relevant. + +### Call context + +Use when scoring a simulated agent call. The judge gets the call transcript, the call's audio recording (where applicable), and the scenario the agent was asked to handle. Used in evals like `Customer Agent Conversation Quality`, `Loop Detection`, and `Termination Handling`. + +### Full span context + +Use when scoring a single span in a tracing project and the judgment depends on attributes that aren't in the span's input or output (for example, the model name, the temperature, the latency, or the error code). Including the full span gives the eval the full picture without you having to map every attribute as a separate variable. + +### Trace context + +Use when the verdict depends on the whole trace, not just one span. For example: "did the agent eventually complete the task" requires seeing every step the agent took, not just the final response. With trace context the eval can read every span in the trace. + +### Session context + +Use when the verdict depends on the full conversation across multiple traces. For example: "did the support agent solve the customer's problem in this session" needs to see every turn, even ones that happened in earlier traces. + +--- + +## How context interacts with variables + +Variables and context are not either/or. The eval gets your mapped variables **and** whichever context options are on. The criteria you write can refer to the variables explicitly with `{{variable_name}}` and refer to the wider context implicitly ("read the trace and decide ..."). + +A common pattern: + +- Map the specific fields the eval should focus on as variables (so the criteria can reference them precisely). +- Turn on a context option to give the judge the surrounding picture (so it can resolve ambiguity). + +For example, an Agent eval that scores a final agent response: + +- Variables: `final_response` +- Context: `Trace context` +- Criteria: "Decide whether `{{final_response}}` resolves the user's request, considering the full conversation in the trace." + +The judge focuses on the final response (the variable) but uses the trace to interpret what "resolves the user's request" means in context. + +--- + +## Cost and reliability trade-offs + +More context costs more. 
A judge that reads the full trace pays in tokens for every span, every input, every output, on every row. A judge that reads only your variables pays for what you mapped. + +Trade-offs: + +- **Latency.** A larger payload takes longer to process. +- **Token cost.** More tokens in, more tokens of reasoning out. +- **Reliability.** A judge with too much context can get distracted or miss the actual question. Less context gives a more focused verdict. + +Add context one option at a time, only when you can show the simpler version produces wrong verdicts. Start with `Template variables`, run a sample, look at the failures. If they're failing because the judge couldn't see something it needed, turn on the smallest context option that gives it that thing. + +--- + +## Where the context comes from + +The available context options depend on where the eval runs. + +| Surface | Available context options | +|---|---| +| **Dataset** | Template variables, Dataset row context | +| **Trace project** | Template variables, Full span context, Trace context, Session context | +| **Simulation** | Template variables, Call context | +| **Eval Playground** | Template variables, plus whichever surface you're testing against | +| **SDK standalone** | Template variables only | + +The eval template's Context setting acts as the maximum. If a template asks for `Trace context`, it can run on a trace project but not on a dataset (datasets don't have traces). If a template asks for `Template variables` only, it can run anywhere. + +--- + +## Next Steps + +- [Eval types](/docs/evaluation/concepts/eval-types): Context is an Agent setting; LLM-As-A-Judge and Code use only mapped variables. +- [Eval templates](/docs/evaluation/concepts/eval-templates): Where the Context setting lives on a template. +- [Create custom evals](/docs/evaluation/features/custom): Configure context when authoring an Agent eval. +- [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Apply an eval to a dataset, trace project, or simulation. diff --git a/src/pages/docs/evaluation/concepts/eval-results.mdx b/src/pages/docs/evaluation/concepts/eval-results.mdx index d11d1840..9414fc6e 100644 --- a/src/pages/docs/evaluation/concepts/eval-results.mdx +++ b/src/pages/docs/evaluation/concepts/eval-results.mdx @@ -1,66 +1,132 @@ --- -title: "Eval Results: Reading and Storing Evaluation Outputs" -description: "Understand what evaluation results contain, how to read them, and how results are stored and aggregated across runs in Future AGI." +title: "Eval Results: What an Eval Returns and Where It Goes" +description: "What an eval result contains, the three output types and what each one returns, and how results are stored across datasets, traces, simulations, and the SDK." --- ## About -Every evaluation run produces a result for each row or call that was scored. A result tells you whether the response passed the criteria, how it scored, and why the judge made that decision. Results are stored alongside your data so you can review them, compare across runs, and track quality over time. +Every evaluation produces one result per row, span, call, or input. A result tells you the verdict, why the judge reached it, and how long it took. Results are stored alongside the data they evaluated so you can review individual rows or aggregate across runs. + +A result has the same shape regardless of the [eval type](/docs/evaluation/concepts/eval-types) (Agents, LLM-As-A-Judge, or Code) that produced it. 
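+
+Via the SDK, the result comes back from `evaluator.evaluate()`. A minimal sketch of reading the verdict and reason (the template and input are placeholders, and `evaluator` is assumed to be an already-constructed client, as in the built-in eval examples):
+
+```python Python
+result = evaluator.evaluate(
+    eval_templates="regex_pii_detection",
+    inputs={"text": "Contact me at jane@example.com"},
+)
+
+eval_result = result.eval_results[0]
+print(eval_result.output)  # the verdict, e.g. "Failed" for a pass/fail template
+print(eval_result.reason)  # the plain-language explanation
+```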
--- ## What a result contains -Each individual result has three parts: - -| Field | Description | +| Field | What it is | |---|---| -| **Output** | The result value: `1.0` (pass), `0.0` (fail), a score between 0 and 100, or a category label depending on the template's output type | -| **Reason** | A plain-language explanation from the judge describing why it assigned that result | -| **Eval ID** | A unique identifier for the eval run, used to retrieve async results | +| **Value** | The verdict. Format depends on the [output type](/docs/evaluation/concepts/output-types) (see below). | +| **Reason** | A plain-language explanation of the verdict. Empty for Code evals that don't produce one. | +| **Runtime** | How long the eval took for this row, in seconds. | +| **Model** | Which model produced the verdict (for Agent and LLM-As-A-Judge evals). | +| **Status** | `Completed`, `Failed`, or `Running` for in-progress async evals. | -The reason field is especially useful for diagnosing failures. Instead of reviewing each response manually, you can read the reason to understand exactly what caused a pass or fail judgment. +The reason is the part you read when something fails. Instead of clicking through every row, you scan reasons to find the pattern. --- -## Output types +## Result format by output type + +The shape of `Value` depends on the eval template's output type. + +### Pass/fail + +The value is a string: `"Passed"` or `"Failed"`. + +For an Agent or LLM-As-A-Judge eval, the verdict is the model's decision. For a Code eval, it comes from a `True` or `False` return. + +``` +Value: "Passed" +Reason: "The response addresses the user's question with no harmful content." +``` + +### Scoring + +The value is a label mapped to a numeric score 0-1. The label is what the judge picked from the choice set you defined; the score is the value you assigned to that label. + +The numeric score is what gets aggregated and what the pass threshold compares against. Use the label when you need the human-readable answer. + +``` +Value: { "choice": "Mostly grounded", "score": 0.7 } +Reason: "The response cites the source for the dates and names but adds an unrelated detail at the end." +``` + +For Code evals, returning a number 0-1 produces a Scoring result. The platform shows the number; the label is omitted. + +### Choices + +The value is the label the judge picked from the choice set. Each label is marked Pass, Neutral, or Fail when you create the template, and that mark determines whether the row passes overall. + +Single-choice example: -| Output type | What it looks like | When to use | -|---|---|---| -| **Pass/Fail** | `1.0` for pass, `0.0` for fail | Binary checks: toxicity, PII, format validation | -| **Score (percentage)** | A number between 0 and 100 | Graded quality: groundedness, relevance, completeness | -| **Deterministic choices** | A category label from a predefined set | Classification: tone, language, intent | +``` +Value: { "choice": "Formal", "score": 1.0 } +Reason: "The response uses complete sentences and avoids contractions." +``` -The output type is defined by the eval template. Custom templates let you configure which type to use when you create them. +Multi-choice example (when the template allows multiple labels): + +``` +Value: { "choice": ["Friendly", "Concise"], "score": 1.0 } +Reason: "The response is short and uses warm phrasing." +``` --- ## Where results are stored -**In a dataset**: Results appear as new columns, one per eval. 
Each row shows the result value and reason for that row. You can add multiple evals to the same dataset and see all results side by side. +Results are stored where the eval runs. The shape is the same; the location differs. -**Via SDK**: Results are returned directly from `evaluator.evaluate()`. Access them via `result.eval_results[0].output` and `result.eval_results[0].reason`. +| Surface | Where the result lives | +|---|---| +| **Dataset** | A new column on the dataset, one cell per row. A second column holds the reason if you enabled the reason column. | +| **Trace project** | An eval log on the span, trace, or session that was scored. Visible on the span detail page and in trace charts. | +| **Simulation** | On the call execution. Visible in the call detail view and the run summary. | +| **Experiment** | A new column per variant in the experiment grid, one cell per row per variant. | +| **Playground** | Returned in the response and shown in the test panel. Not persisted. | +| **SDK** | Returned to the caller. For async runs, returned via the eval ID once the run completes. | -**Async runs**: For long-running or large-batch runs, the SDK returns an `eval_id` immediately. Use `evaluator.get_eval_result(eval_id)` to retrieve results when the run completes. +The same template can run on multiple surfaces and produce comparable results, since the value format is the same everywhere. --- ## Aggregates and KPIs -When you run evals on a dataset, Future AGI aggregates results across all rows: +When evals run on more than one row, the platform aggregates results. + +| Output type | Aggregate it shows | +|---|---| +| **Pass/fail** | Pass rate (percentage of rows that passed) | +| **Scoring** | Average score, distribution across labels | +| **Choices** | Distribution across labels (with single-choice) or label co-occurrence (with multi-choice) | + +For composite evals, the aggregate is computed using the [aggregation function](/docs/evaluation/concepts/eval-templates#single-vs-composite) you set on the template (`Weighted average`, `Average`, `Min`, `Max`, or `Pass rate`). + +Aggregates appear in the dataset evaluation summary, the trace project's eval charts, and the experiment comparison view. + +--- + +## Reading results + +A few things to keep in mind when reading individual results: + +- **The reason cites what the eval saw.** For Agent evals that searched a knowledge base or the internet, the reason includes what the eval found. For Code evals, the reason is whatever your code returned (often empty). +- **A failed status is different from a failing verdict.** Status `Failed` means the eval itself errored (the model timed out, the code threw, the input was invalid). A pass/fail verdict of `"Failed"` means the eval ran and the answer didn't pass. +- **The score is what aggregates use.** When the value is a `{ choice, score }` object, summary stats use the score, not the label. The label is for humans reading individual rows. + +--- + +## Async results -- **Pass rate**: percentage of rows that passed, for pass/fail templates -- **Average score**: mean score across all rows, for percentage templates -- **Distribution**: breakdown of results across categories, for deterministic templates -- **Trend data**: how results change across runs over time +For long-running evals (large datasets, batch jobs, CI/CD runs), the eval is queued and the SDK returns an eval ID immediately. Use the ID to check status and retrieve results when the run completes. 
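+
+A minimal sketch of that round trip (the template call mirrors the built-in eval examples; the `eval_id` attribute and `get_eval_result` helper follow the pattern described in these docs, but treat the exact names as illustrative):
+
+```python Python
+# Queue the run; for long-running batches the SDK returns an eval ID immediately
+submission = evaluator.evaluate(
+    eval_templates="prompt_adherence",
+    inputs={
+        "input": "Summarise the ticket in two sentences.",
+        "output": "The customer reports a billing error on the March invoice.",
+    },
+    model_name="turing_flash",
+)
+eval_id = submission.eval_id  # assumed attribute holding the returned ID
+
+# Later, check status and fetch the finished result
+final = evaluator.get_eval_result(eval_id)
+print(final.eval_results[0].output, final.eval_results[0].reason)
+```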
-These aggregates appear in the evaluation summary view and are tracked per eval template per dataset run, giving you a versioned history of quality changes. +This is also how you resume a long run from a different process: store the eval ID, then fetch the result later. --- -## Next steps +## Next Steps +- [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices in detail, including how scoring labels map to numeric scores. +- [Eval templates](/docs/evaluation/concepts/eval-templates): Where the output type comes from and how composite evals aggregate child results. - [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Run an eval and see results. -- [Eval templates](/docs/evaluation/concepts/eval-templates): How templates define what output type a result uses. -- [Judge models](/docs/evaluation/concepts/judge-models): How the judge produces the result and reason. - [CI/CD pipeline](/docs/evaluation/features/cicd): Track results by version across deploys. diff --git a/src/pages/docs/evaluation/concepts/eval-templates.mdx b/src/pages/docs/evaluation/concepts/eval-templates.mdx index 4201fec8..3ed5884a 100644 --- a/src/pages/docs/evaluation/concepts/eval-templates.mdx +++ b/src/pages/docs/evaluation/concepts/eval-templates.mdx @@ -1,71 +1,121 @@ --- -title: "Eval Templates: Built-in and Custom Evaluation Templates" -description: "Explains what evaluation templates are, the difference between built-in and custom templates, and how output types determine what an eval returns." +title: "Eval Templates: Built-in, Custom, Composite, and Versioned" +description: "What eval templates are, the difference between built-in and custom, single vs composite, and how versioning lets you change a template without breaking running evals." --- ## About -An eval template is the definition of what to measure. It contains the criteria the judge model will apply to each response and specifies what kind of result to return. You create a template once and reuse it across any dataset, simulation, experiment, or SDK call. +An eval template defines what to measure. It holds the criteria, the type ([Agents, LLM-As-A-Judge, or Code](/docs/evaluation/concepts/eval-types)), the input variables it expects, the model to use, and the output type. You create a template once and reuse it across any dataset, simulation, experiment, trace project, or SDK call. -Templates are the reusable unit of evaluation logic. Whether you're checking for toxicity, verifying that a response stays grounded in a source document, or enforcing a company-specific rule, the logic lives in the template. +Templates are the unit you share. Two datasets that use Toxicity reference the same template; mappings and overrides are stored separately for each use. --- -## Built-in vs custom templates +## Built-in vs custom | | Built-in | Custom | |---|---|---| -| **Who writes the criteria** | Future AGI | You | -| **How to access** | Select from the template list in the UI or pass the name to the SDK | Create via UI or API, then use by name | -| **Covers** | 70+ categories: quality, safety, factuality, RAG, bias, format, audio, image | Any domain-specific, business, or regulatory rule you define | -| **Required inputs** | Defined per template (e.g. 
`input`, `output`, `context`) | You define the required keys in the template config | +| **Authored by** | Future AGI | You or your team | +| **Where it lives** | Available in every workspace | Scoped to your workspace | +| **Editable** | Read-only (you can duplicate and edit the copy) | Fully editable | +| **Coverage** | 70+ templates across quality, safety, factuality, RAG retrieval, format, bias, audio, image | Anything you can express as instructions, code, or an agent rubric | +| **Mapping** | You map your data to the template's required keys | You define the keys when you write the template | -See [Built-in evals](/docs/evaluation/builtin) for the full list of available templates. +See [Built-in evals](/docs/evaluation/builtin) for the full list. See [Create custom evals](/docs/evaluation/features/custom) for how to author your own. -See [Create custom evals](/docs/evaluation/features/custom) for how to write your own. +--- + +## Single vs composite + +A template is either a **single** eval or a **composite** that combines multiple children into one result. + +### Single + +Runs one check and returns one result per row. This is the default. + +### Composite + +Runs several child evals and aggregates them into a single score. Useful when "good" is a combination of independent checks (for example, an agent response should be helpful **and** polite **and** factually grounded). + +What you configure on a composite template: + +| Setting | What it controls | +|---|---| +| **Children** | The eval templates to run for each row. Add as many as you need. | +| **Child axis** | The output type all children must share: `Pass/fail`, `Scoring`, or `Choices`. Locked once the first child is added so children stay comparable. | +| **Aggregation** | Whether to combine child results or just run them independently and report each. | +| **Aggregation function** | When aggregation is on, how to combine: `Weighted average`, `Average`, `Min`, `Max`, or `Pass rate` (fraction passing). | +| **Weight per child** | Numeric weight used by `Weighted average`. | + +**Use composite when:** +- One overall score should reflect several independent quality dimensions +- You want a safety gate (`Min` aggregation across hard checks) +- You want a pass rate across a fixed battery of checks --- ## Output types -Every template returns one of three output types: +Every template returns one of three output types. The type is set when you create the template and determines what the result looks like. -| Type | Description | Example | -|---|---|---| -| **Pass/Fail** | Binary result: 1.0 for pass, 0.0 for fail | Toxicity check: passed or failed | -| **Score (percentage)** | Numeric value between 0 and 100 | Groundedness: 87 out of 100 | -| **Deterministic choices** | Categorical result from a defined set of options | Tone classification: `formal`, `informal`, `neutral` | +| Output type | UI label | What you define | What the eval returns per row | +|---|---|---|---| +| `pass_fail` | **Pass/fail** | Nothing extra | `"Passed"` or `"Failed"`, plus a reason | +| `percentage` | **Scoring** | Choice labels mapped to scores 0-1 (e.g. 
`Excellent: 1.0, Average: 0.5, Poor: 0.0`) and a pass threshold | A label, the numeric score it maps to, and a reason | +| `deterministic` | **Choices** | Choice labels each marked Pass / Fail / Neutral, plus optional multi-choice toggle | One or more labels, plus a reason | + + +"Scoring" is choice-based now: you define a small set of labelled levels and their numeric values, rather than asking the judge for an arbitrary number. This produces more consistent results across runs. + -Every result also includes a **reason**: a plain-language explanation of why the judge assigned that result. This makes it possible to understand failures without reviewing each response manually. +See [Output types](/docs/evaluation/concepts/output-types) for full details, including how to set choice scores and the pass threshold. --- ## Required keys and input mapping -Templates declare the input keys they expect. For example, a groundedness template might require `output` (the model response) and `context` (the source document). When you run an eval, you map your actual data to these keys. +Templates declare the input keys they expect, written as `{{variable_name}}` in the criteria, prompt, or code. -**In the UI**: When you add a template to a dataset or simulation, the platform shows a mapping form. You select which column corresponds to each required key. +When you use a template, you map your actual data to those keys. -**In the SDK**: Pass a dict where the keys match what the template expects: +**Example.** A `groundedness` template requires `output` and `context`. To run it on a dataset, you pick which dataset column maps to `output` and which to `context`. To run it on a trace project, you map to span attributes instead. -```python -result = evaluator.evaluate( - template=Groundedness(), - input={ - "output": "The Eiffel Tower is in Paris.", - "context": "The Eiffel Tower is a wrought-iron lattice tower in Paris, France.", - }, -) -``` +The mapping is stored per use, not on the template. The same template can map to different columns in different datasets, or to different span attributes in different trace projects, without you copying the template. + +For built-in templates the required keys are documented on each template's page. For custom templates the keys are anything you write inside `{{ }}` in your criteria. + +--- + +## Versioning + +Templates are versioned. Each save creates a new immutable snapshot, and one version is the default. + +| Action | What it does | +|---|---| +| **Save** | Updates the template and creates a new version snapshot. | +| **Set as default** | Marks a version as the one new uses pick up by default. | +| **Restore** | Copies a previous version back into the current template. | + +This means you can change a template's criteria, model, or settings without breaking running evals. Existing dataset evals, trace evals, and SDK calls continue using whatever version they pinned to. New uses pick up the default. + +--- + +## System vs custom ownership + +| | System | Custom | +|---|---|---| +| **Visible to** | Every workspace | Your workspace | +| **Editable** | No (you can duplicate to make a custom copy) | Yes | +| **Updates over time** | Future AGI updates these as the platform improves | You decide when to change them | -Built-in templates have fixed required keys documented in the template reference. 
Custom templates let you define any keys using `{{variable_name}}` placeholders in the rule prompt: the key names you use in the prompt become the required keys you must supply at run time. +System templates are the built-ins. Custom templates are anything you create or duplicate. --- -## Next steps +## Next Steps - [Built-in evals](/docs/evaluation/builtin): Full list of available templates with required keys and output types. -- [Create custom evals](/docs/evaluation/features/custom): Write your own criteria and rule prompts. -- [Eval types](/docs/evaluation/concepts/eval-types): LLM as Judge, Deterministic, Statistical Metric, and LLM as Ranker. -- [Judge models](/docs/evaluation/concepts/judge-models): How the model applies a template to produce a result. -- [Eval results](/docs/evaluation/concepts/eval-results): What the output of an eval run looks like. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. +- [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices in detail. +- [Create custom evals](/docs/evaluation/features/custom): Author your own template. +- [Eval results](/docs/evaluation/concepts/eval-results): What the output of an eval run looks like and where it goes. diff --git a/src/pages/docs/evaluation/concepts/eval-types.mdx b/src/pages/docs/evaluation/concepts/eval-types.mdx index c00c3a98..9a9e5170 100644 --- a/src/pages/docs/evaluation/concepts/eval-types.mdx +++ b/src/pages/docs/evaluation/concepts/eval-types.mdx @@ -1,122 +1,136 @@ --- -title: "Eval Types: Four Evaluation Methods in Future AGI" -description: "The four evaluation methods in Future AGI: LLM as Judge, Deterministic, Statistical Metric, and LLM as Ranker, and how modality affects which ones apply." +title: "Eval Types: Agents, LLM-As-A-Judge, Code" +description: "The three eval types in Future AGI, what each one can do, and how to pick the right one for your check." --- ## About -Every eval template in Future AGI uses one of four evaluation methods to produce a result. The method determines how the eval computes its output, whether a judge model is required, and what kind of result to expect. Choosing the right type for your use case gives you the right balance of accuracy, speed, and cost. +Every eval template in Future AGI is one of three types: **Agents**, **LLM-As-A-Judge**, or **Code**. The type determines what the eval can do at run time: whether it can call tools, look things up, follow a single prompt, or run as deterministic code. + +The type is set when you create the template and shown as tabs on the create page. + +| Type | What it runs | What it can use | Returns a reason | +|---|---|---|---| +| **Agents** | A multi-step evaluator that iterates over the input | Tools and MCP connectors, knowledge bases, internet, dataset row / span / trace / session context | Yes | +| **LLM-As-A-Judge** | One LLM call against a templated prompt | Few-shot examples, system + user message chain | Yes | +| **Code** | A Python or JavaScript function in a sandbox | Standard libraries; whatever you write | Optional, depends on what the code returns | --- -## LLM as Judge +## When to use each -The judge model reads the response, applies the template's criteria, and reasons about whether it passes. This is the most flexible type: it handles subjective, context-dependent, and nuanced quality checks that cannot be expressed as a fixed rule. +- Use **Agents** when a single prompt cannot decide on its own. 
The eval needs to consult a knowledge base, fetch fresh information from the internet, or reason over the full trace tree before judging. +- Use **LLM-As-A-Judge** when the criteria fit a single prompt and the model can decide from the inputs you map. This is the right default for quality, safety, tone, and groundedness checks. +- Use **Code** when the answer is computable: format validation, exact match, regex, length, n-gram overlap, or a custom scoring function you can write in Python or JavaScript. -**Requires a judge model.** Configure one in [Future AGI models](/docs/evaluation/features/futureagi-models) or [custom models](/docs/evaluation/features/custom-models). +--- -**Returns**: a result (pass/fail, score, or category) and a plain-language **reason** explaining the judgment. +## Agents -**Examples**: Groundedness, Toxicity, Task Completion, Tone, Detect Hallucination, Instruction Adherence, PII Detection, Context Adherence, and all custom evals. +An Agent eval runs as an iterative loop. On each step it can call configured tools, read the data you injected, and decide whether to keep going or return a verdict. This is the option to pick when a one-shot prompt cannot reach a confident answer. -**Best for:** -- Quality checks that require understanding meaning or intent -- Safety and policy enforcement -- RAG pipeline evaluation (context adherence, relevance, chunk attribution) -- Custom business or regulatory rules written in plain language +What you can configure when creating an Agent eval: ---- +| Setting | What it controls | +|---|---| +| **Mode** | `Auto` (balances quality and speed), `Agent` (deep, reasoning-based), `Quick` (runs fast, for quick iteration). | +| **Use Internet** | Lets the eval fetch up-to-date information during the run. | +| **Connectors** | MCP connectors the eval can call as tools. Configured in your workspace's Falcon AI connectors. | +| **Knowledge Bases** | Knowledge bases the eval can search. The eval cites what it retrieved in the reason. | +| **Context** | What payload the eval receives in addition to your `{{variables}}`. See [Data injection](/docs/evaluation/concepts/data-injection). | +| **Summary** | Whether the reason is `Short`, `Long`, `Concise`, or follows a custom rubric you write. | + +**Use it when:** +- Fact verification needs to look something up +- The judgment needs evidence from a curated knowledge base +- A compliance check spans multiple sources or steps +- Higher confidence is worth more time per row -## Deterministic / Rule-based +--- -Computed directly from the text using code or string logic. No model is called and no API key is required. Given the same input, always returns the same output. +## LLM-As-A-Judge -**Does not require a judge model.** Runs locally; works without an API key via the standalone `evaluate()` function. +A single LLM call against a prompt template. You write a system message and any user messages, drop in `{{variables}}` where the data goes, pick a model, and the eval is done. This is the most common type for built-in evals like Toxicity, Groundedness, Tone, Task Completion, and PII Detection. -**Returns**: pass/fail only. No reason field. +What you can configure: -**Examples**: Is JSON, Is Email, Contains Valid Link, No Invalid Links, One Line. +| Setting | What it controls | +|---|---| +| **Messages** | The system + user message chain the model sees. Use `{{variable_name}}` for any field that gets filled at run time. | +| **Model** | Which model judges. 
Future AGI ships `turing_large`, `turing_small`, and `turing_flash`. You can also bring your own model with [custom models](/docs/evaluation/features/custom-models). | +| **Few-shot examples** | Optional dataset of labelled examples to show the model the kind of judgment you want. | +| **Template format** | Mustache-style `{{variable}}` (default) or Jinja, if you need control flow. | -**Best for:** -- Format validation (valid JSON, email address, URL presence) -- High-volume pipelines where speed and zero API cost matter -- Offline or air-gapped environments -- First-pass filtering before running LLM-based evals +**Use it when:** +- One prompt is enough: the judge has everything it needs in the inputs you mapped. +- You want speed and predictable cost per row. +- The check is subjective but well-bounded (tone, factuality, helpfulness). --- -## Statistical Metric - -Computes a numeric score using an algorithm applied to the output and a reference value. Covers overlap metrics, edit distance, semantic similarity, and information retrieval metrics. No judge model is needed for most: embedding-based metrics call an embedding model, not a generative one. +## Code -**Returns**: a numeric score (e.g. 0–1 or 0–100). No reason field. +A Python or JavaScript function that runs in a sandbox. No model call. Every row gets the same logic, deterministically. -**Examples:** +What you can configure: -| Metric | What it measures | +| Setting | What it controls | |---|---| -| BLEU, ROUGE | N-gram overlap between output and reference | -| Levenshtein Similarity | Character edit distance between output and reference | -| Numeric Similarity | Numerical difference between output and reference | -| Embedding Similarity | Semantic vector similarity between output and reference | -| Semantic List Contains | Whether output contains phrases semantically similar to a reference list | -| Recall@K, Precision@K, NDCG@K, MRR, Hit Rate | Retrieval quality for RAG pipelines | -| FID Score | Distribution similarity between sets of real and generated images | -| CLIP Score | Alignment between an image and its text description | - -**Best for:** -- Benchmarking against a ground-truth reference answer -- RAG retrieval quality (recall, precision, ranking) -- Image generation quality -- Reproducible, model-free scoring +| **Language** | `Python` or `JavaScript`. | +| **Code** | Your function body in the in-page editor. The eval calls a `evaluate` function with the variables you mapped. | +| **Output type** | `Pass/fail` or `Scoring` based on what your code returns (boolean, number, or dict). See [Output types](/docs/evaluation/concepts/output-types). | ---- +**Use it when:** +- The check is a rule: format, regex, length, exact match, JSON schema. +- You need a metric: BLEU, ROUGE, embedding similarity, retrieval recall. +- The cost of an LLM call is not justified for a deterministic answer. +- You want zero variance across runs. -## LLM as Ranker +Future AGI ships dozens of built-in Code evals so you don't have to write them yourself: Contains, Equals, Regex, Is JSON, BLEU Score, ROUGE, Embedding Similarity, Recall@K, Precision@K, NDCG@K, MRR, CLIP Score, FID Score, and more. -A variant of LLM as Judge where instead of scoring a single response, the model ranks a set of retrieved context chunks based on relevance to a query. Used specifically for evaluating retrieval ordering in RAG pipelines. +--- -**Requires a judge model.** +## Choosing a type -**Returns**: a ranked score per context item. 
+Use this flow when you don't already know which type to pick: -**Examples**: Eval Ranking. +1. Can the answer be computed without a model? Use **Code**. +2. Can a single prompt decide if you give the model the right inputs? Use **LLM-As-A-Judge**. +3. Does the judgment need external lookup, tools, or reasoning over a full trace? Use **Agents**. -**Best for:** -- Evaluating whether a retrieval system surfaces the most relevant chunks at the top -- Diagnosing retrieval ordering issues in RAG pipelines +When several types could work, prefer Code first (cheapest, fastest, deterministic), then LLM-As-A-Judge, then Agents only if a single prompt is not enough. --- ## Modality -In addition to the four types above, evals also vary by the kind of input they accept: +Eval types and modality are independent. Any type can evaluate text. Image, audio, and conversation evals require a model that supports that modality. -| Modality | What it evaluates | Example evals | +| Modality | What it evaluates | Where it usually fits | |---|---|---| -| **Text** | Any text input or output | Most built-in evals | -| **Image** | Images passed as inputs | CLIP Score, FID Score, Caption Hallucination, Image Instruction Adherence, Synthetic Image Evaluator, OCR Evaluation | -| **Audio** | Audio files or speech | Audio Quality, Audio Transcription, TTS Accuracy | -| **Conversation** | Multi-turn conversation histories | Customer Agent evals (Loop Detection, Context Retention, Query Handling, etc.) | +| **Text** | Any text input or output | All three eval types | +| **Image** | Images passed as inputs (CLIP Score, FID Score, Caption Hallucination, Image Instruction Adherence, OCR Evaluation) | LLM-As-A-Judge or Code | +| **Audio** | Audio files or speech (Audio Quality, Audio Transcription, TTS Accuracy) | LLM-As-A-Judge with a multimodal judge | +| **Conversation** | Multi-turn conversation histories (Customer Agent evals, Loop Detection, Context Retention) | Agents or LLM-As-A-Judge | -Multimodal evals (image, audio, conversation) require a judge model that supports the relevant modality. Use `turing_large` or `turing_small` for image and audio inputs. +For multimodal evals, pick a judge model that supports the modality. `turing_large` supports text, image, and audio; `turing_small` and `turing_flash` support text and image. --- ## Quick reference -| Type | Judge model required | Returns reason | No API key possible | -|---|---|---|---| -| LLM as Judge | Yes | Yes | No | -| Deterministic | No | No | Yes | -| Statistical Metric | No (most) | No | Yes (most) | -| LLM as Ranker | Yes | No | No | +| Type | Calls a model | Calls tools / MCP | Uses knowledge base | Internet access | Returns reason | +|---|---|---|---|---|---| +| Agents | Yes | Yes | Yes | Yes | Yes | +| LLM-As-A-Judge | Yes | No | No | No | Yes | +| Code | No | No | No | No | Optional | --- -## Next steps +## Next Steps -- [Built-in evals](/docs/evaluation/builtin): Full list with evaluation method and required inputs for each template. -- [Create custom evals](/docs/evaluation/features/custom): Custom evals always use LLM as Judge. -- [Judge models](/docs/evaluation/concepts/judge-models): Choose the right model for LLM as Judge and LLM as Ranker evals. -- [Eval groups](/docs/evaluation): Combine different eval types and run them together in one pass. +- [Eval templates](/docs/evaluation/concepts/eval-templates): Built-in vs custom, single vs composite, versioning. +- [Output types](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices. 
+- [Data injection](/docs/evaluation/concepts/data-injection): How to give an eval more than just `{{variables}}`. +- [Create custom evals](/docs/evaluation/features/custom): Author your own template of any type. +- [Judge models](/docs/evaluation/concepts/judge-models): Pick the right model for an LLM-As-A-Judge or Agent eval. diff --git a/src/pages/docs/evaluation/concepts/judge-models.mdx b/src/pages/docs/evaluation/concepts/judge-models.mdx index 60eacd86..dd654a46 100644 --- a/src/pages/docs/evaluation/concepts/judge-models.mdx +++ b/src/pages/docs/evaluation/concepts/judge-models.mdx @@ -1,61 +1,83 @@ --- -title: "Judge Models: Choosing the Right Evaluation Scorer" -description: "Explains what a judge model is, how it scores AI responses, and how to choose the right judge model for your specific evaluation use case." +title: "Judge Models: Picking the Right Model for an Eval" +description: "What a judge model is, the Future AGI built-in models, when to bring your own, and how the model interacts with each eval type." --- ## About -A judge model is the model that reads each response and applies the eval template criteria to produce a result. When you run an evaluation, the judge receives the text to evaluate, the template's rule prompt, and the required inputs, then returns a result and a reason. +A judge model is the model that runs an evaluation. For an [Agent or LLM-As-A-Judge eval](/docs/evaluation/concepts/eval-types), the judge reads the inputs you mapped, applies the criteria, and returns a verdict and a reason. For a Code eval there is no judge model: the code is the judge. -The judge model determines how accurately and how quickly each response gets scored. Choosing the right one lets you balance precision and performance for your specific workload. +The judge model determines how accurate, how fast, and how expensive each row is to evaluate. Picking the right one is the main lever you have on the cost / accuracy trade-off for an eval. --- -## How a judge scores a response +## When this applies -1. The platform constructs a prompt from the eval template criteria and the row's input values. -2. The judge model receives this prompt and reads the response being evaluated. -3. The judge returns a result (pass/fail, score, or category) and a reason explaining the judgment. -4. The platform stores the result and reason for that row. +| Eval type | Picks a model | Notes | +|---|---|---| +| **Agents** | Yes | The model also drives the agent's reasoning across iterations. Higher [agent modes](/docs/evaluation/concepts/eval-types#agents) call the model more times. | +| **LLM-As-A-Judge** | Yes | One call per row. | +| **Code** | No | Pure code execution. | -The judge model does not generate or modify your AI's responses. It only reads and scores them. +For multimodal evals (image, audio), the model also has to support that modality. --- -## Available judge models +## Future AGI built-in models -Future AGI provides a set of proprietary models built specifically for evaluation: +Future AGI ships three judge models tuned for evaluation. They appear in a "Future AGI Models" section at the top of every model picker. 
-| Model | Code | Best for | Latency |
-|---|---|---|---|
-| TURING_LARGE | `turing_large` | Max accuracy, multimodal evals (text, image, audio) | Higher |
-| TURING_SMALL | `turing_small` | High fidelity at lower cost (text, image) | Medium |
-| TURING_FLASH | `turing_flash` | Fast, high-accuracy evals (text, image) | Low |
-| PROTECT | `protect` | Safety, guardrails, user-defined rules (text, audio) | Low |
-| PROTECT_FLASH | `protect_flash` | First-pass binary filtering (text only) | Ultra-low |
+| Model | Code | What it's for |
+|---|---|---|
+| **Turing Large** | `turing_large` | Best accuracy for complex evaluations. Multimodal: text, image, audio. Default for most built-in templates. |
+| **Turing Small** | `turing_small` | Balanced accuracy at lower cost. Multimodal: text, image. |
+| **Turing Flash** | `turing_flash` | Fast, low-latency evaluations. Multimodal: text, image. Use for high-volume runs. |
-See [Future AGI models](/docs/evaluation/features/futureagi-models) for full details on each model.
+---
+
+## Bring your own model
+
+Future AGI also lets you use your own model as the judge. Add it through workspace settings (`AI Providers` for direct integrations like OpenAI, Bedrock, Vertex, Azure; or a custom endpoint for any HTTP-accessible model). Once added, it appears in the eval model picker alongside the Future AGI models.
-You can also bring your own model using the [custom models](/docs/evaluation/features/custom-models) integration. This is useful when you need a domain-specific fine-tuned model, want to keep inference in a specific cloud region, or already pay for a model you want to use as the judge.
+This is useful when:
+
+- You need a domain-specific or fine-tuned model
+- Compliance requires inference in a specific region or vendor
+- You already pay for a model and want to reuse it as the judge
+
+See [Use custom models](/docs/evaluation/features/custom-models) for the setup steps and the providers supported.

---

-## How to choose a judge
+## How to choose

| Situation | Recommended model |
|---|---|
-| Maximum accuracy matters more than speed | `turing_large` |
-| High quality at reasonable cost | `turing_small` |
-| Large-scale runs where speed is important | `turing_flash` |
-| Safety and guardrail checks | `protect` or `protect_flash` |
-| Evaluating images or audio | `turing_large` or `turing_small` |
-| Domain-specific or compliance requirements | [Custom model](/docs/evaluation/features/custom-models) |
+| Highest accuracy on a complex check | `turing_large` |
+| Sensible default for most checks | `turing_large` for built-ins; `turing_small` for high-volume custom |
+| High-volume runs where latency or cost matters | `turing_flash` |
+| Image evals | `turing_large` or `turing_small` |
+| Audio evals | `turing_large` |
+| Compliance / fine-tuned needs | A custom model |
+
+If you are not sure, start with the model the template defaults to. Switch to a faster model once you've confirmed the eval gives the answers you expect; switch to a more accurate model if the verdicts disagree with your judgment too often.
+
+---
+
+## Model and eval type interactions
+
+A few specifics to be aware of:
+
+- **Agent eval modes call the model more than once.** `Quick` mode is roughly one call per row; `Auto` and `Agent` can take several. Picking a faster model has more impact on cost in higher modes.
+- **Few-shot examples on LLM-As-A-Judge add to the prompt.** The token cost scales with how many examples you include.
+- **Code evals do not call any model.** They are unaffected by your model choice. --- -## Next steps +## Next Steps -- [Future AGI models](/docs/evaluation/features/futureagi-models): Full reference for built-in judge models. -- [Use custom models](/docs/evaluation/features/custom-models): Bring your own model as the judge. -- [Eval templates](/docs/evaluation/concepts/eval-templates): The criteria the judge applies. +- [Future AGI models](/docs/evaluation/features/futureagi-models): Full reference for the built-in judge models. +- [Use custom models](/docs/evaluation/features/custom-models): Bring your own model. +- [Eval types](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. +- [Eval templates](/docs/evaluation/concepts/eval-templates): Where the model setting lives on a template. - [Eval results](/docs/evaluation/concepts/eval-results): What the judge produces after scoring. diff --git a/src/pages/docs/evaluation/concepts/output-types.mdx b/src/pages/docs/evaluation/concepts/output-types.mdx new file mode 100644 index 00000000..fb448a21 --- /dev/null +++ b/src/pages/docs/evaluation/concepts/output-types.mdx @@ -0,0 +1,159 @@ +--- +title: "Output Types: Pass/fail, Scoring, and Choices" +description: "How to pick an output type when creating an eval, what each one returns, and how labels map to numeric scores so results stay consistent across runs." +--- + +## About + +Every eval template has one output type. The output type determines what shape the verdict takes, what you have to define on the template, and how results aggregate. There are three: **Pass/fail**, **Scoring**, and **Choices**. + +The output type is set on the template, not at run time. Picking the right one is the second decision after [eval type](/docs/evaluation/concepts/eval-types). + +--- + +## At a glance + +| Output type | UI label | What you define | What a row returns | +|---|---|---|---| +| `pass_fail` | **Pass/fail** | Nothing extra | `"Passed"` or `"Failed"` | +| `percentage` | **Scoring** | Labels with numeric scores 0-1 (in 0.1 increments), plus a pass threshold | `{ "choice": "Good", "score": 0.7 }` | +| `deterministic` | **Choices** | Labels each marked Pass / Neutral / Fail, plus optional multi-choice | `{ "choice": "Formal", "score": 1.0 }` (or a list when multi-choice) | + +--- + +## Pass/fail + +The simplest output. The eval returns `"Passed"` or `"Failed"` and a reason. + +**You define:** +Nothing on the template beyond picking this output type. + +**The eval returns:** +``` +Value: "Passed" +Reason: "The response stays grounded in the provided sources." +``` + +**Aggregates as:** +A pass rate across rows. + +**Use it for:** +Binary verdicts. Toxicity, PII detection, format validation, "did the agent fulfil the task or not." + +--- + +## Scoring + +The judge picks one of your labels and the platform maps that label to a numeric score 0-1. Both are returned. + +**You define:** + +| Setting | What it is | +|---|---| +| **Choice labels** | A small set of named levels (e.g. `Excellent`, `Good`, `Average`, `Poor`). | +| **Score per label** | A numeric value 0-1 for each label (e.g. `Excellent: 1.0`, `Average: 0.5`, `Poor: 0.0`). | +| **Pass threshold** | A score 0-1. Rows with `score >= threshold` count as passing. Default 0.5. | + +**The eval returns:** +``` +Value: { "choice": "Good", "score": 0.7 } +Reason: "The response covers the main points and cites the source for dates." 
+``` + +**Aggregates as:** +The average score across rows; pass rate against the threshold; distribution across labels. + +**Use it for:** +Graded quality where you want both a numeric score (for averaging and trend lines) and a human-readable level (for reading individual rows). Groundedness, completeness, instruction adherence, summary quality. + + +The model picks a label, not a raw number. This is intentional: defined levels stay consistent across runs and across rows in a way that asking for an arbitrary 0-1 score does not. If you want continuous numeric output (a metric like BLEU, ROUGE, or embedding similarity), use a [Code eval](/docs/evaluation/concepts/eval-types#code) instead. + + +### Pass threshold + +The threshold is how Scoring rows turn into pass / fail counts for aggregation. With these labels: + +``` +Excellent: 1.0 +Good: 0.7 +Average: 0.5 +Poor: 0.0 +``` + +a threshold of `0.5` makes `Excellent`, `Good`, and `Average` rows count as passing; `Poor` rows count as failing. A threshold of `0.7` would count only `Excellent` and `Good` as passing. A threshold of `0.8` would count only `Excellent` as passing. + +Pick the threshold based on what counts as "good enough" in your context. Stricter thresholds give you a tighter pass rate but flag more rows for review. + + +Choice scores and the pass threshold are configured with sliders in the UI. Both use `0.1` increments, so valid values are `0.0`, `0.1`, ..., `1.0`. + + +--- + +## Choices + +The judge picks one or more labels from a fixed set. Each label is marked Pass, Neutral, or Fail when you create the template, which determines whether the row passes. + +**You define:** + +| Setting | What it is | +|---|---| +| **Choice labels** | The set of categories the judge can pick from. | +| **Verdict per label** | Each label is marked **Pass**, **Neutral**, or **Fail**. | +| **Multi-choice** | Toggle. When on, the judge can pick more than one label per row. | + +**The eval returns (single-choice):** +``` +Value: { "choice": "Formal", "score": 1.0 } +Reason: "The response uses complete sentences and avoids contractions." +``` + +**The eval returns (multi-choice):** +``` +Value: { "choice": ["Friendly", "Concise"], "score": 1.0 } +Reason: "The response is short and uses warm phrasing." +``` + +The numeric `score` reflects the verdict mark on the chosen label (Pass = 1.0, Neutral = 0.5, Fail = 0.0). When multi-choice is on, the score is computed across the chosen labels. + +**Aggregates as:** +Distribution across labels; pass rate (rows where the chosen label is marked Pass). + +**Use it for:** +Classification. Tone (`Formal` / `Casual` / `Neutral`), intent detection (`Question` / `Complaint` / `Compliment`), language identification, multi-label tagging. + +--- + +## How to pick + +Use this as a default rule: + +1. **Is the answer yes or no?** Use **Pass/fail**. +2. **Is the answer "how good?" with a few defined levels?** Use **Scoring**. +3. **Is the answer "which category?" from a fixed set?** Use **Choices**. + +Don't reach for Scoring just because you want a number. Pass/fail aggregates as a percentage (which is a number). Reach for Scoring when you genuinely have several quality levels you want to distinguish. 
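As a concrete illustration, here is how a small batch of Scoring results (using the labels from the threshold example above) rolls up into both a pass rate and an average score. The arithmetic is the point; the snippet is an illustrative sketch, not platform code:

```python
# Illustrative only: how a pass threshold turns { choice, score } results
# into the pass rate and average score shown in the aggregates.
results = [
    {"choice": "Excellent", "score": 1.0},
    {"choice": "Good", "score": 0.7},
    {"choice": "Average", "score": 0.5},
    {"choice": "Poor", "score": 0.0},
]

threshold = 0.5  # rows with score >= threshold count as passing

passed = sum(1 for r in results if r["score"] >= threshold)
pass_rate = passed / len(results)                                  # 3 of 4 rows -> 0.75
average_score = sum(r["score"] for r in results) / len(results)    # 2.2 / 4 -> 0.55

print(f"pass rate: {pass_rate:.2f}, average score: {average_score:.2f}")
```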
+ +--- + +## Output type by eval type + +All three output types work with all three [eval types](/docs/evaluation/concepts/eval-types), with one nuance for Code: + +| Eval type | Pass/fail | Scoring | Choices | +|---|---|---|---| +| **Agents** | Yes | Yes | Yes | +| **LLM-As-A-Judge** | Yes | Yes | Yes | +| **Code** | Returning `True` / `False` | Returning a number 0-1 | Not applicable; pick one of the other two | + +For Code evals, the return value of your function determines the verdict directly. The platform doesn't ask the judge to pick a label. + +--- + +## Next Steps + +- [Eval types](/docs/evaluation/concepts/eval-types): Pick the type before the output type. +- [Eval templates](/docs/evaluation/concepts/eval-templates): Where the output type is set. +- [Eval results](/docs/evaluation/concepts/eval-results): What the result looks like once an eval runs. +- [Create custom evals](/docs/evaluation/features/custom): Author a template and configure its output type. diff --git a/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx b/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx index c09f74ce..7b8f40d0 100644 --- a/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx +++ b/src/pages/docs/evaluation/concepts/understanding-evaluation.mdx @@ -1,58 +1,65 @@ --- -title: "Understanding AI Evaluation in Future AGI Platform" -description: "Covers how evaluation works in Future AGI: templates, judge models, eval types, results, and where evaluations run in the platform and SDK." +title: "Understanding Evaluation in Future AGI" +description: "How evaluation works in Future AGI: templates, types, output, and where you can run them across datasets, traces, simulations, experiments, and the SDK." --- ## About -Evaluation in Future AGI is a systematic way to measure whether your AI is producing the right outputs. You define what "right" means once, using an eval template, and the platform scores every response automatically against that definition, returning a result and a reason for each one. +Evaluation is how you check whether your AI is doing the right thing. You define a check once, the platform runs it against your data, and every row gets a verdict and a reason. You can review individual rows, watch the aggregate over time, or fail a CI run when scores drop. -Every eval run has three components working together: a **template** that defines the criteria, a **judge model** that applies the criteria to each response, and a **result** that records the outcome. You supply the data; the platform handles the scoring. +Three pieces work together: an [eval template](/docs/evaluation/concepts/eval-templates) that defines what to measure, an [eval type](/docs/evaluation/concepts/eval-types) that determines how it runs (Agents, LLM-As-A-Judge, or Code), and a [result](/docs/evaluation/concepts/eval-results) that records the outcome. Pick a template, map your data, run. --- ## How it works -1. **Choose a template**: Select a built-in template (e.g. Toxicity, Groundedness, Tone) or create a custom one with your own rule prompt. Templates define what to measure and what output type to expect (pass/fail, score, or a category). +1. **Pick or create a template.** Choose from 70+ built-in templates (Toxicity, Groundedness, Tone, Task Completion, BLEU, ROUGE, ...) or create a custom one. 
The template's [type](/docs/evaluation/concepts/eval-types) determines what it can do at run time:
+   - **Agents** can call tools and MCP connectors, search a knowledge base, use the internet, and reason over multiple steps.
+   - **LLM-As-A-Judge** runs one templated prompt against a model.
+   - **Code** runs Python or JavaScript with no model call.
-2. **Map your data**: Tell the eval which columns or input keys contain the text to evaluate (e.g. which column is the model response, which is the reference context).
+2. **Map your data.** Each template declares the input variables it expects, written as `{{variable_name}}` in the criteria. You map your actual data to those variables. On a dataset, you pick columns. On a trace project, you pick span attributes. The template stays the same; only the mapping changes.
-3. **Pick a judge model**: Choose a Future AGI model (e.g. `turing_flash`) or bring your own via a custom model integration. The judge reads each row and applies the template criteria.
+3. **Configure run settings.** When applying the template, you can override the model, change agent settings (mode, tools, knowledge bases, internet access, what context to inject), turn on error localization, and decide whether to add a reason column.
-4. **Run**: The platform processes every row in parallel. Each row gets a result (pass/fail or a score) and a reason explaining the judgment.
+4. **Run.** Future AGI processes every row in parallel. Each row gets a value, a reason, runtime, and the model that produced the verdict.
-5. **Review**: Results appear as new columns in your dataset, or as aggregated summaries and KPIs across runs.
+5. **Review.** Results appear as new columns in the dataset, as eval logs on traces, as scores on simulation calls, or as returned values from the SDK. Aggregates (pass rate, average score, distributions) appear automatically.

---

## Where evals run

-Evals are not limited to datasets. The same templates work across every surface in Future AGI:
+The same templates work across every surface in Future AGI. Pick the surface that matches what you want to evaluate.
-| Surface | What you evaluate | -|---|---| -| **Dataset** | Score every row in a dataset against one or more templates | -| **Simulation** | Evaluate agent responses in a run test against your criteria | -| **Experiments** | Compare prompt or model versions using the same eval criteria | -| **CI/CD pipeline** | Run evals automatically on every code change and track results by version | -| **SDK** | Call `evaluator.evaluate()` from any script or application | +| Surface | What you evaluate | When to use | +|---|---|---| +| **Dataset** | Every row in a structured dataset | Offline evaluation, batch quality checks, regression suites | +| **Eval Playground** | A single ad-hoc input you type or paste | Quick sanity check while authoring a template | +| **Trace project** | Spans, traces, or sessions captured from your AI app | Live and historical evaluation of production traffic | +| **Simulation** | Calls produced by a simulated agent run | Pre-production testing of agents and prompts | +| **Experiment** | Variant outputs side by side (different prompts or models) | A/B comparison of prompt or model changes | +| **CI/CD pipeline** | Eval pass rates per code version | Gate deploys on quality | +| **SDK** | Anything you can pass to a Python or TypeScript function | Integrate evaluation into your own scripts and pipelines | -Using the same template across surfaces keeps results directly comparable without redefining criteria each time. +Using the same template across surfaces keeps results directly comparable without redefining your quality criteria each time. --- ## Key concepts -- **[Eval templates](/docs/evaluation/concepts/eval-templates)**: The definition of what to measure. Built-in or custom. -- **[Judge models](/docs/evaluation/concepts/judge-models)**: The model that applies the template criteria and produces the result. -- **[Eval results](/docs/evaluation/concepts/eval-results)**: The output of a run: result value, reason, and aggregates. -- **[Eval groups](/docs/evaluation)**: Named collections of templates you run together as a single unit. +- [**Eval types**](/docs/evaluation/concepts/eval-types): Agents, LLM-As-A-Judge, and Code. The type determines what a template can do. +- [**Eval templates**](/docs/evaluation/concepts/eval-templates): The shareable definition of a check. Built-in or custom, single or composite, versioned. +- [**Output types**](/docs/evaluation/concepts/output-types): Pass/fail, Scoring, and Choices. What the eval returns. +- [**Judge models**](/docs/evaluation/concepts/judge-models): Which model judges, when applicable. +- [**Data injection**](/docs/evaluation/concepts/data-injection): What context an eval gets in addition to your `{{variables}}`. +- [**Eval results**](/docs/evaluation/concepts/eval-results): The format of the verdict, reason, and aggregates. --- -## Next steps +## Next Steps - [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate): Run your first eval. -- [Built-in evals](/docs/evaluation/builtin): 70+ templates across quality, safety, factuality, RAG, and more. -- [Create custom evals](/docs/evaluation/features/custom): Define your own criteria and rule prompts. -- [Eval groups](/docs/evaluation): Bundle multiple evals and run them in one pass. +- [Built-in evals](/docs/evaluation/builtin): 70+ templates across quality, safety, factuality, RAG, audio, and image. +- [Create custom evals](/docs/evaluation/features/custom): Define your own criteria in any of the three types. 
+- [Test playground](/docs/evaluation/features/test-playground): Try an eval against a row, span, simulation, or custom input before committing it to a dataset. diff --git a/src/pages/docs/evaluation/concepts/versioning.mdx b/src/pages/docs/evaluation/concepts/versioning.mdx new file mode 100644 index 00000000..dd0d6636 --- /dev/null +++ b/src/pages/docs/evaluation/concepts/versioning.mdx @@ -0,0 +1,134 @@ +--- +title: "Versioning: Changing a Template Without Breaking Running Evals" +description: "How eval template versions work in Future AGI, what each version stores, and how to use Set as Default and Restore Version to manage changes safely." +--- + +## About + +Eval templates are versioned. Every save creates a new immutable snapshot, and one of those snapshots is the **default** version that new uses pick up. Existing dataset evals, trace evals, simulation evals, and SDK calls keep using whichever version they pinned to. + +This means you can edit a template's criteria, model, or settings without breaking anything that's already running. The change rolls out to new uses; the old uses stay on the version they started on. + +--- + +## What a version captures + +Each version is a snapshot of the template's settings at the time of save: + +| Saved in the version | +|---| +| Criteria (rule prompt or code) | +| Model | +| Output type and choice scores | +| Pass threshold | +| Required keys / variables | +| Agent settings (mode, tools, knowledge bases, internet, context) | +| Few-shot examples reference | + +Versions are immutable. You can't edit a version after it's saved; you can only create a new one. + +--- + +## How versions are created + +A new version is created automatically every time you save the template. There's no separate "create version" step in normal use: edit the template, save, and a snapshot is recorded. + +You'll see versions listed as `V1`, `V2`, `V3`, ... in the version selector at the top of the eval detail page. The current default is marked. + +--- + +## Default version + +Exactly one version is the default. When someone: + +- Adds the eval to a new dataset +- Adds the eval to a new trace project +- Calls the eval from the SDK without pinning a version +- Includes the eval as a child of a new composite + +they get the default version. + +To change which version is default, open the version menu and pick **Set as Default**. The change applies only to **future** uses; existing pins stay on whichever version they were on. + +--- + +## Restore Version + +**Restore Version** copies a previous version's settings back into the current template. It does **not** reset the version history. What it does instead: + +1. Reads the settings from the version you picked +2. Applies them to the current template +3. Saves, which creates a **new** version (so V1, V2, V3 → V1, V2, V3, V4 with V4 = V2's content) + +This keeps the history complete: you can always look back at what V3 was, even after restoring. + +Use **Restore Version** when: + +- You shipped a change in V3 that produces worse verdicts than V2 +- You want to roll back without losing the V3 attempt from history +- You want to fork: restore an old version, edit, save as the new default + +--- + +## Pinning a specific version + +When you apply an eval to a dataset, trace project, or simulation, the platform records which version was active at that moment. From then on, that use is pinned to that version. New default versions don't change anything for existing uses. + +This is the core safety property. 
Edit a template freely; existing evaluations don't change unless you explicitly re-pin them. + +To re-pin an existing use to the new default, remove and re-add the eval, or use the version selector inside the eval picker if you want to keep the binding and just bump the version. + +--- + +## Versions in composites + +When a composite eval includes a child template, you can pin the child to a specific version. The composite uses that pinned version every time, even if the child's default version changes later. + +This is the right default for production composites: lock the children, change them only when you intend to. + +--- + +## Common patterns + +### Edit, run, compare + +1. Edit the template and save (creates V2). +2. Apply the new default to a small dataset and check the verdicts. +3. If the new version looks better, keep it as default. +4. If it looks worse, **Restore Version** back to V1 (creates V3 = V1's settings). + +### Promote without breaking running evals + +1. Edit the template and save (creates V2). Existing dataset evals stay on V1. +2. Verify V2 with the playground or a fresh dataset. +3. **Set as Default** if you're happy. New uses get V2; old uses stay on V1 until re-pinned. + +### Hotfix on production + +1. Notice that V3 (current default) is producing wrong verdicts. +2. **Set as Default** on V2 to roll back the default. +3. Investigate V3's problem and fix it in V4. + +In all three patterns, history stays complete and existing uses aren't disrupted unless you explicitly re-pin. + +--- + +## What's not versioned + +Versions only cover the template definition. They do **not** cover: + +- The mapping between your data and the template's variables (that's per-use, stored on the binding, not the template) +- Run-time overrides like model or context (also per-use) +- The dataset, trace project, or simulation the eval is applied to +- Eval results (those are stored where the eval ran, not on the template) + +If you change a mapping, you don't need a new version of the template. If you change the criteria, you do. + +--- + +## Next Steps + +- [Eval templates](/docs/evaluation/concepts/eval-templates): The shape of a template and what versioning preserves. +- [Composite evals](/docs/evaluation/concepts/composite-evals): Pinning child versions inside a composite. +- [Eval results](/docs/evaluation/concepts/eval-results): Where results live and why they're not affected by template versioning. +- [Create custom evals](/docs/evaluation/features/custom): Save creates the first version automatically. diff --git a/src/pages/docs/evaluation/features/custom-models.mdx b/src/pages/docs/evaluation/features/custom-models.mdx index d9d8a4a5..6562e65f 100644 --- a/src/pages/docs/evaluation/features/custom-models.mdx +++ b/src/pages/docs/evaluation/features/custom-models.mdx @@ -9,7 +9,7 @@ Evaluations need a model to act as the judge: to read each response and decide w This matters when you have a model that knows your domain better, when you need inference to stay within a specific cloud provider or region, or when you want to track evaluation costs against a model you already pay for. -Once you add a custom model, it appears in the model dropdown everywhere evaluations are configured : datasets, simulations, custom evals, and eval groups. +Once you add a custom model, it appears in the model dropdown everywhere evaluations are configured: datasets, simulations, custom evals, and trace projects. 
Two ways to connect: diff --git a/src/pages/docs/evaluation/features/custom.mdx b/src/pages/docs/evaluation/features/custom.mdx index 66c648b6..f25169ad 100644 --- a/src/pages/docs/evaluation/features/custom.mdx +++ b/src/pages/docs/evaluation/features/custom.mdx @@ -1,149 +1,385 @@ --- -title: "Create Custom Evaluation Templates in Future AGI" -description: "Define custom evaluation criteria and rules tailored to your use case, extending beyond the built-in templates available in Future AGI." +title: "Create Custom Evals" +description: "Create your own eval templates in Future AGI as Agents, LLM-As-A-Judge, or Code, including all configuration options shown in the UI." --- ## About -Every AI product has its own definition of a good response. Custom evals let you encode those rules and run them at scale, you write the criteria once, in plain language, and Future AGI scores every response against it automatically, returning a result and a reason for each one. +Custom evals let you encode your own quality, safety, or business rules and run them at scale. You write the criteria once and the platform runs it on every row, span, simulation call, or input. -Once created, a custom eval works exactly like any built-in template: use it on a dataset, in a simulation, or call it from the SDK. +A custom eval is one of three types: **Agents**, **LLM-As-A-Judge**, or **Code**. Pick the one that matches what you need before you start. See [Eval types](/docs/evaluation/concepts/eval-types) for help choosing. + +Once a template is saved, use it the same way as any built-in: apply it to a dataset, attach it to a trace project, run it from the SDK. --- -## When to use +## Prerequisites -- **Domain-specific validation**: Assess content against industry or regulatory standards that aren’t in the default templates. -- **Business rule compliance**: Enforce your organization’s guidelines (tone, format, disclosures) in a repeatable eval. -- **Complex or weighted scoring**: Implement multi-criteria or custom scoring logic via your rule prompt. -- **Custom output formats**: Validate specific response structures or formats (e.g. JSON shape, required fields) with a tailored eval. +- A Future AGI workspace +- A Future AGI API key (Settings → API Keys) for SDK or API use --- -## How to - -You can create custom evals from the **UI** or via the **SDK** (by calling the REST API from your code). After the template is saved, run it from the UI or from the evaluation SDK using the template name. - - - - - - Open your **dataset**, click **Evaluate** in the top-right, then **Add Evaluation**. Select **Create your own eval** to start the custom-eval flow. - ![Open evaluation creation](/screenshot/product/evaluation/create-custom-evals/1.png) - - - **Name**: unique name for the eval (lowercase letters, numbers, hyphens, and underscores only; cannot start or end with `-` or `_`). Used when you add the eval to a dataset or call it from the SDK. - - **Model**: choose **Use Future AGI Models** (e.g. turing_large, turing_flash, turing_small, protect, protect_flash) or **Use other LLMs** (your own or external providers). For model details, see [Future AGI models](/docs/evaluation/features/futureagi-models) and [Use custom models](/docs/evaluation/features/custom-models). - ![Configure basic settings](/screenshot/product/evaluation/create-custom-evals/2.png) - - - In **Rule prompt** (criteria), write the instructions the model will follow to evaluate each row. 
Use **`{{variable_name}}`** for placeholders; you'll map these to dataset columns (or SDK input keys) when you add or run the eval. Be specific about what counts as pass/fail or how to score. - ![Define evaluation rules](/screenshot/product/evaluation/create-custom-evals/3.png) - - - **Pass/Fail**: binary result (e.g. 1.0 pass, 0.0 fail). **Percentage (score)**: numeric score between 0 and 100. **Deterministic choices**: categorical result; provide a dict of allowed choices. - ![Configure output type](/screenshot/product/evaluation/create-custom-evals/4.png) - - - - **Tags**: for filtering and organization. - - **Description**: shown in the evaluation list. - - **Check Internet**: allow the eval to fetch up-to-date information when needed. - - **Required keys**: list the input variable names the eval expects (e.g. `input`, `output`, `user_query`, `chatbot_response`). - - ![Add optional settings and save](/screenshot/product/evaluation/create-custom-evals/5.png) - - - Click **Create Evaluation**. The new template appears in your evaluation list and can be added to any dataset or called via the SDK using the name you gave it. - - - In your dataset, click **Evaluate** → **Add Evaluation**, select the custom eval you created, map the **columns** to the rule-prompt variables, then click **Add & Run**. See [Running your first eval](/docs/evaluation) for the full UI flow. - - - - - Creating a custom eval template requires a POST to the Future AGI API. Once created, run it using the `Evaluator` from the `ai-evaluation` SDK. - - - - ```bash - pip install ai-evaluation - ``` - - - Send a POST to `/model-hub/create_custom_evals/` using your `FI_API_KEY` and `FI_SECRET_KEY` as headers. - - ```python - import requests - - response = requests.post( - "https://api.futureagi.com/model-hub/create_custom_evals/", - headers={ - "X-Api-Key": "your-fi-api-key", - "X-Secret-Key": "your-fi-secret-key", - }, - json={ - "name": "chatbot_politeness_and_relevance", - "description": "Evaluates if the response is polite and relevant.", - "criteria": "Evaluate: 1) Politeness. 2) Relevance to: {{user_query}}. Response: {{chatbot_response}}. Pass only if both.", - "output_type": "Pass/Fail", - "required_keys": ["user_query", "chatbot_response"], - "config": {"model": "turing_small"}, - "check_internet": False, - "tags": ["customer-service"], - }, - ) - print(response.json()) # {"eval_template_id": "..."} - ``` - - - Use the template **name** you registered with `Evaluator.evaluate()`: - - ```python - from fi.evals import Evaluator - - evaluator = Evaluator( - fi_api_key="your-fi-api-key", - fi_secret_key="your-fi-secret-key", - ) - - result = evaluator.evaluate( - eval_templates="chatbot_politeness_and_relevance", - inputs={ - "user_query": "What is the return policy?", - "chatbot_response": "Our return policy allows returns within 30 days.", - }, - ) - - print(result.eval_results[0].output) - print(result.eval_results[0].reason) - ``` - - - +## Create from the UI + + + + + +Navigate to **Evaluations** in the sidebar, then click **Create Evaluation**. + + + + + +**Name:** Lowercase letters, numbers, hyphens, and underscores only. Cannot start or end with `-` or `_`. The name is how the eval is referenced from the SDK and the apply flow. + +**Type:** Pick one of three tabs at the top: + +| Tab | When to pick it | +|---|---| +| **Agents** | The eval needs to call tools, search a knowledge base, use the internet, or reason over a trace. | +| **LLM-As-A-Judge** | One prompt is enough. 
The model decides from the inputs you map. | +| **Code** | The check is computable in Python or JavaScript without a model call. | + +The fields below depend on which type you pick. + + + + + + + + + +Write the **Instructions** that the agent should follow when judging each row. Use `{{variable_name}}` for any data that gets filled in at run time. The variables you reference become the eval's required keys. + +``` +You are evaluating whether the response stays factually grounded in the provided source. + +Source: {{context}} +Response: {{output}} + +Verdict: pass if every claim in the response is supported by the source, otherwise fail. +``` + +Then configure the agent's run settings: + +| Setting | What it does | +|---|---| +| **Model** | The judge model. Defaults to `turing_large`. See [Judge models](/docs/evaluation/concepts/judge-models). | +| **Mode** | `Quick` (single pass), `Auto` (balanced), `Agent` (deeper reasoning). Higher modes use more tokens for higher confidence. | +| **Use Internet** | Allow the eval to fetch up-to-date information during the run. | +| **Connectors** | MCP connectors the eval can call as tools. | +| **Knowledge Bases** | Knowledge bases the eval can search. | +| **Context** | What the eval sees beyond your `{{variables}}`. See [Data injection](/docs/evaluation/concepts/data-injection) for the six options. | +| **Summary** | Whether the reason is `Short`, `Long`, `Concise`, or follows a custom rubric. | +| **Template format** | `Mustache` (default `{{variable}}`) or `Jinja` if you need control flow. | + +![Agents tab populated with Instructions, Model, Mode, Use Internet, Connectors, Knowledge Bases, Context, and Summary settings](/images/docs/evaluation/custom/agents-tab.png) + + + + + +Write the **Messages** the judge sees. The first message is typically a system message describing the rubric; the user message contains the data wrapped in `{{variable_name}}` placeholders. + +``` +[ + { + "role": "system", + "content": "You are evaluating tone. Decide whether the response is professional." + }, + { + "role": "user", + "content": "Response: {{output}}" + } +] +``` + +Then configure: + +| Setting | What it does | +|---|---| +| **Model** | The judge model. See [Judge models](/docs/evaluation/concepts/judge-models). | +| **Few-shot examples** | Optional dataset of labelled examples. Shows the judge the kind of verdict you want. | +| **Template format** | `Mustache` (default) or `Jinja`. | + +![LLM-As-A-Judge tab with the message editor, model picker, and few-shot examples picker](/images/docs/evaluation/custom/llm-judge-tab.png) + + + + + +Pick a language and write a function that judges the row. The platform calls an `evaluate` function with the variables you mapped. + +```python +from typing import Any + +# Return: bool, float (0-1), dict with score+reason, or None to skip +def evaluate( + input: Any, # Input to the AI system + output: Any, # AI system's output + expected: Any, # Ground truth (may be None) + context: dict, # Mode-specific data (see below) + **kwargs # Additional mapped variables +): + """ + context keys by mode: + Dataset: context["row"], context["dataset_name"] + Tracing: context["span"], context["trace"], context["spans"], context["session"] + Simulation: context["transcript"], context["call_metrics"] + """ + if expected is None: + return None + + return { + "score": 1.0 if output == expected else 0.0, + "reason": "Exact match check", + } +``` + +| Setting | What it does | +|---|---| +| **Language** | `Python` or `JavaScript`. 
| +| **Code** | The function body. The variables you reference (`input`, `output`, `expected`, `context`, anything else) become the eval's required keys. | +| **Return value** | `True` / `False` for Pass/fail, a `float` between 0 and 1 for Scoring, a `dict` with `score` and `reason` for richer output, or `None` to skip the row. | + + + + + + + +Pick what the eval returns. See [Output types](/docs/evaluation/concepts/output-types) for full details. + +| Output type | UI label | What you configure | +|---|---|---| +| `pass_fail` | **Pass/fail** | Nothing extra. | +| `percentage` | **Scoring** | Choice labels with scores 0-1, plus a pass threshold. | +| `deterministic` | **Choices** | Choice labels each marked Pass / Neutral / Fail. Optional multi-choice toggle. | + +![Output type radio group with Scoring selected, showing choice-label rows with score sliders](/images/docs/evaluation/custom/output-type-scoring.png) + + + + + +**Error Localization** is a toggle right below the output type. When on, the eval analyses why a row failed and surfaces the offending field. Available for Agent and LLM-As-A-Judge evals. See [Error Localization](/docs/evaluation/features/error-localization). + +The **Advanced** section is collapsible. Open it to set: + +| Setting | What it does | +|---|---| +| **Description** | Free-form description shown in the eval list. | +| **Tags** | Categories like `safety`, `nlp_metrics`, `audio`, `agents`. Used for filtering. | + + + + + +Click **Test Evaluation** to run the eval against a sample input without saving. The test panel lets you fill in the variables manually or pull a row from a dataset, span, or simulation. See [Test Playground](/docs/evaluation/features/test-playground). + +The Test Evaluation button is the right way to validate the criteria before committing the template. Nothing is persisted by Test. + + + + + +Click **Save**. The template is created and the first version (V1) is recorded. The eval appears in the evaluation list. + +To use it: apply it to a dataset, attach it to a trace project, run it from the SDK, or include it in a composite. See [Evaluate via Platform & SDK](/docs/evaluation/features/evaluate). + + + + + +--- + +## Create from the SDK + +The SDK calls the same API the UI uses. Use this when you want to define evals in code, version them with your application, or generate them from configuration. + + + + + +```bash +pip install ai-evaluation +``` + + + + + + + +```python title="Python" +import requests + +response = requests.post( + "https://api.futureagi.com/model-hub/eval-templates/create-v2/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, + json={ + "name": "response_groundedness", + "description": "Checks the response is supported by the source.", + "eval_type": "llm", + "instructions": ( + "You are evaluating whether the response is supported by the source.\n" + "Source: {{context}}\n" + "Response: {{output}}\n" + "Pass if every claim is grounded in the source." 
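+            # The {{context}} and {{output}} placeholders above become the
+            # eval's required input keys when this template is run.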
+ ), + "model": "turing_large", + "output_type": "pass_fail", + "pass_threshold": 0.5, + "tags": ["rag", "factuality"], + }, +) +print(response.json()) # {"eval_template_id": "..."} +``` + +```typescript title="TypeScript" +const response = await fetch( + "https://api.futureagi.com/model-hub/eval-templates/create-v2/", + { + method: "POST", + headers: { + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + "Content-Type": "application/json", + }, + body: JSON.stringify({ + name: "response_groundedness", + description: "Checks the response is supported by the source.", + eval_type: "llm", + instructions: + "You are evaluating whether the response is supported by the source.\n" + + "Source: {{context}}\n" + + "Response: {{output}}\n" + + "Pass if every claim is grounded in the source.", + model: "turing_large", + output_type: "pass_fail", + pass_threshold: 0.5, + tags: ["rag", "factuality"], + }), + }, +); + +console.log(await response.json()); // { eval_template_id: "..." } +``` + +```bash title="cURL" +curl -X POST https://api.futureagi.com/model-hub/eval-templates/create-v2/ \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "response_groundedness", + "description": "Checks the response is supported by the source.", + "eval_type": "llm", + "instructions": "Source: {{context}}\nResponse: {{output}}\nPass if every claim is grounded.", + "model": "turing_large", + "output_type": "pass_fail", + "pass_threshold": 0.5, + "tags": ["rag", "factuality"] + }' +``` + + + + + + + +Once created, run the eval by referencing its `name`: + +```python +from fi.evals import Evaluator + +evaluator = Evaluator( + fi_api_key="YOUR_API_KEY", + fi_secret_key="YOUR_SECRET_KEY", +) + +result = evaluator.evaluate( + eval_templates="response_groundedness", + inputs={ + "context": "Paris is the capital of France.", + "output": "The capital of France is Paris.", + }, +) + +print(result.eval_results[0].output) # "Passed" +print(result.eval_results[0].reason) # explanation +``` + + + + + +--- + +## Field reference + +Every field accepted by the create endpoint, mapped to its UI label. + +| Field | Type | UI label | Notes | +|---|---|---|---| +| `name` | string | **Name** | Required for non-drafts. Lowercase, numbers, `-`, `_`. | +| `eval_type` | `"agent" \| "llm" \| "code"` | Type tab | Default `"llm"`. | +| `instructions` | string | **Instructions** (Agent), **Messages** content (LLM) | Up to 100k chars. Use `{{variable}}` placeholders. | +| `code` | string | **Code** editor | For `eval_type: "code"`. | +| `code_language` | `"python" \| "javascript"` | **Language** | For `eval_type: "code"`. | +| `messages` | array | **Messages** | For LLM-As-A-Judge. List of `{role, content}`. | +| `few_shot_examples` | array | **Few-shot examples** | For LLM-As-A-Judge. | +| `model` | string | **Model** | Default `"turing_large"`. | +| `mode` | `"auto" \| "agent" \| "quick"` | **Mode** | Agent only. | +| `tools` | object | **Connectors / Use Internet** | `{internet: bool, connectors: [...]}`. | +| `knowledge_bases` | string[] | **Knowledge Bases** | Agent only. | +| `data_injection` | object | **Context** | Agent only. See [Data injection](/docs/evaluation/concepts/data-injection). | +| `summary` | object | **Summary** | Agent only. `{type: "short" \| "long" \| "concise" \| "custom", custom_instructions: ""}`. 
| +| `output_type` | `"pass_fail" \| "percentage" \| "deterministic"` | **Output type** | See [Output types](/docs/evaluation/concepts/output-types). | +| `pass_threshold` | float | **Pass threshold** | 0.0-1.0, default 0.5. | +| `choice_scores` | object | **Choice labels** | `{"Excellent": 1.0, "Good": 0.7, ...}`. Scores are 0-1 in 0.1 increments. | +| `description` | string | **Description** | Optional. | +| `tags` | string[] | **Tags** | Optional. | +| `check_internet` | bool | **Use Internet** | Agent only (alternative to setting it inside `tools`). | +| `error_localizer_enabled` | bool | **Error Localization** | Agent and LLM only. | +| `is_draft` | bool | (internal) | Skips name validation. Used by the FE for autosave. | + +--- + +## What gets saved + +When you save: + +1. The template is created with `owner: "user"` and your workspace. +2. Version `V1` is captured as an immutable snapshot of the settings. +3. The template appears in the evaluation list, scoped to your workspace. + +Editing the template later creates `V2`, `V3`, ... with `V1` preserved as history. See [Versioning](/docs/evaluation/concepts/versioning). + --- ## Next Steps - - Run evals from the UI or SDK. - - - Add your custom eval to a group and run it with others. - - - Bring your own model for evaluations. + + Try your eval against a sample input before applying it. - - Built-in models available for evals. + + Apply the eval to a dataset, trace project, simulation, or SDK call. - - Run evals automatically in your pipeline. + + Bundle this eval with others into a single composite check. - - How evaluation fits into the platform. + + Pick the right type for your use case. diff --git a/src/pages/docs/evaluation/features/error-localization.mdx b/src/pages/docs/evaluation/features/error-localization.mdx new file mode 100644 index 00000000..aa4f99c9 --- /dev/null +++ b/src/pages/docs/evaluation/features/error-localization.mdx @@ -0,0 +1,172 @@ +--- +title: "Error Localization" +description: "Pinpoint which input field caused a row to fail an eval. Available on Agent and LLM-As-A-Judge evals; surfaces a flagged field and an explanation per failed row." +--- + +## About + +When a row fails an eval, you want to know **why**: which part of the input caused the failure, not just that the verdict was Failed. Error Localization is the feature that answers that question. When it's on, the platform analyses each failed row, picks the input field most likely to be the cause, and surfaces it alongside the failure reason. + +This makes failure review faster: instead of reading the full reason and guessing where the problem is, you see the offending field flagged directly. + +Error Localization is available for **Agent** and **LLM-As-A-Judge** evals. It does not apply to Code evals (there's no model trace for the localizer to introspect). + +--- + +## When to use + +- A dataset has many input columns and you want to know which one is breaking the eval +- An agent response failed groundedness and you want to know whether the response is wrong, the source is wrong, or the question is wrong +- You're debugging a custom eval and want extra signal beyond the verdict + +It's most useful on multi-input evals like `groundedness` (`output` + `context`), `task_completion` (`task` + `output`), or any custom eval with several `{{variables}}`. Single-input evals like `toxicity` (`output` only) gain less since there's nothing to localize against. 
+ +--- + +## What you get + +When Error Localization is on, each failed row produces: + +| Field | What it is | +|---|---| +| **Selected input key** | The variable the localizer thinks is the cause (e.g. `context`, `output`, `task`). | +| **Error analysis** | A short description of what looks wrong with that field. | +| **Standard reason** | The eval's normal explanation, unchanged. | + +The selected key is highlighted in the result UI so you can spot it at a glance. + +--- + +## Turn it on + +You can enable Error Localization in two places. + +### On the template + +Toggle **Error Localization** when creating or editing the eval. Every application of the template (dataset, trace, simulation) inherits the setting unless overridden. + +![Eval create page with the Error Localization toggle in the on position, shown as its own section below the Output Type configuration](/images/docs/evaluation/error-localization/toggle-on.png) + +### Per application + +When applying an eval to a dataset, trace project, or simulation, you can override the template's setting just for this binding. Useful when you want extra detail on one investigation without changing the template default. + +--- + +## How it runs + +Error Localization runs **after** the main eval, only for rows that failed. So: + +- Rows that pass the eval do not trigger Error Localization. No extra cost on passes. +- Failed rows queue an additional analysis job after the main verdict is recorded. + +The status progresses through: + +| Status | What it means | +|---|---| +| `pending` | The row failed; localization is queued. | +| `running` | Analysis is in progress. | +| `completed` | Analysis finished; selected key and analysis are visible. | +| `failed` | The localizer itself errored. The eval's verdict is unchanged; only the extra analysis is missing. | +| `skipped` | The row didn't qualify (typically a Code eval failure or a single-input eval). | + +The dataset / trace / simulation row shows the verdict immediately and the error analysis arrives shortly after. + +--- + +## What's not localized + +Error Localization does not run on: + +- **Passing rows.** Only failures trigger it. +- **Code evals.** No model trace exists for the localizer to read. +- **Rows where the eval itself errored** (timeout, malformed input). The status of those rows is `Failed` (eval-level), distinct from a localized failure. + +If a template's eval type is `code`, the toggle is hidden in the create page. + +--- + +## Cost considerations + +Error Localization adds an extra LLM call per failed row. The cost scales with how many rows fail, not with how many rows you eval. A pass-rate-95% dataset with 1000 rows pays for ~50 localization calls; a pass-rate-50% dataset pays for ~500. + +Trade-offs: + +- Leave it on for low-volume datasets where each failure matters +- Leave it off for high-volume continuous trace evals where pass rate is high and individual failures are reviewed less often +- Turn it on per investigation when you're actively debugging + +--- + +## Apply via the SDK + +When applying an eval to a dataset programmatically, set `error_localizer: true` on the apply request. 
+ +```python +import requests + +requests.post( + "https://api.futureagi.com/model-hub/develops/{dataset_id}/add_user_eval/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, + json={ + "name": "groundedness-with-localization", + "template_id": "groundedness-template-uuid", + "config": { + "mapping": { + "output": "ai_response_column_uuid", + "context": "source_column_uuid", + }, + "reason_column": True, + }, + "error_localizer": True, + "run": True, + }, +) +``` + +For a standalone SDK eval, set `error_localizer=True`: + +```python +from fi.evals import Evaluator + +evaluator = Evaluator( + fi_api_key="YOUR_API_KEY", + fi_secret_key="YOUR_SECRET_KEY", +) + +result = evaluator.evaluate( + eval_templates="groundedness", + inputs={ + "output": "Paris is the capital of Germany.", + "context": "Paris is the capital of France.", + }, + model_name="turing_flash", + error_localizer=True, +) + +print(result.eval_results[0].output) # "Failed" +print(result.eval_results[0].reason) # main reason +print(result.eval_results[0].error_localizer) # selected_input_key + error_analysis +``` + +--- + +## Next Steps + + + + How verdicts and reasons are reported. + + + Toggle Error Localization when authoring a template. + + + Apply an eval with Error Localization to a dataset or trace. + + + Pair Error Localization with ground truth data for richer analysis. + + diff --git a/src/pages/docs/evaluation/features/evaluate.mdx b/src/pages/docs/evaluation/features/evaluate.mdx index 57a3ffe4..c5784a27 100644 --- a/src/pages/docs/evaluation/features/evaluate.mdx +++ b/src/pages/docs/evaluation/features/evaluate.mdx @@ -1,170 +1,339 @@ --- -title: "Evaluate via Platform and SDK in Future AGI" -description: "Run evaluations using the Future AGI platform UI or the Python SDK. Choose individual templates or batch runs for scalable model assessment." +title: "Evaluate via Platform and SDK" +description: "Apply an eval template to a dataset, trace project, simulation, or run it directly from the SDK. Covers mapping, overrides, and result review." --- ## About -Evaluation is how you measure whether your AI is actually doing what you want it to do. +Once you have an [eval template](/docs/evaluation/concepts/eval-templates) (built-in or custom), you apply it to your data. The platform supports five surfaces: -You give it an input (a prompt, a response, a conversation) and an eval scores it. The score tells you if the output was accurate, safe, on-topic, well-structured, or whatever quality you care about. Every evaluation returns a **result** (e.g. Passed / Failed, or a numeric score), and a **reason** explaining why. +| Surface | What you evaluate | +|---|---| +| **Dataset** | Every row in a dataset. Results show as new columns. | +| **Trace project** | Spans, traces, or sessions captured from your AI app. | +| **Simulation** | Calls produced by a simulated agent run. | +| **Eval Playground** | A single ad-hoc input you type or paste. | +| **SDK** | Anything you can pass to a Python or TypeScript function. | -You can run evaluations two ways: +The same templates work across all five. The only thing that differs is the mapping: on a dataset you map to columns, on a trace you map to span attributes, on a simulation you map to call fields. The verdict format stays the same. -- **Platform UI**: point-and-click on a dataset. No code required. -- **Python SDK**: call `evaluator.evaluate()` from your code, scripts, or CI pipeline. +This page covers the dataset and SDK paths in detail. 
For the others see [Test playground](/docs/evaluation/features/test-playground), [Trace evaluation](/docs/observe/features/evals), and [Simulation evaluation](/docs/quickstart/running-evals-in-simulation). -Both support the same [built-in eval templates](/docs/evaluation/builtin) (e.g. Toxicity, Groundedness, Tone) and any custom evals you've defined. +--- + +## When to use which surface + +- **Dataset:** Offline evaluation, batch quality checks, regression suites against a fixed set of inputs. +- **Trace project:** Live and historical evaluation of production traffic. Score what your app actually did. +- **Simulation:** Pre-production testing where you control the inputs but want to evaluate the agent's behaviour. +- **Eval Playground:** Quick sanity check while authoring a template. +- **SDK:** Integrate evaluation into your own scripts, CI pipelines, or applications. --- -## When to use +## Apply to a dataset (UI) + + + + + +Open the dataset you want to evaluate. If you don't have one yet, see [Create a dataset](/docs/dataset). + +![Populated dataset open in the dashboard with the Evaluate button at the top right of the data grid toolbar](/images/docs/evaluation/evaluate/dataset-page.png) + + + + + +Click **Evaluate** in the top-right of the dataset view. The eval picker drawer opens. + + + + + +Browse or search the eval list. You'll see: + +- **Built-in evals** like Toxicity, Groundedness, Tone (read-only, you can duplicate them) +- **Custom evals** authored in your workspace + +Click the eval to open its config panel. + +![Eval picker drawer with search box, tag filters, and a list of available built-in and custom evals](/images/docs/evaluation/evaluate/add-evaluation.png) + +Click an eval in the list to expand it and preview the criteria, required variables, and default settings before adding it. + + + + + +Map each variable the eval expects to a column in your dataset. + +For example, a `groundedness` eval expects `output` and `context`. Pick which dataset column maps to each. + +![Configured evals panel showing each eval mapped to dataset columns and a Run All button at the bottom](/images/docs/evaluation/evaluate/configured-evals-panel.png) -- **Catch regressions before they ship**: Run evals in CI so a bad prompt change or model update gets flagged before it reaches production. -- **Score outputs at scale**: Attach evals to a dataset and every row gets a score automatically, without reviewing each one manually. -- **Enforce safety and compliance**: Check every response for toxicity, PII, bias, or data privacy issues as part of your standard pipeline. -- **Compare models or prompts**: Evaluate the same inputs across different models or prompt variations to see which performs better on your criteria. -- **Monitor quality over time**: Run the same evals repeatedly to track whether your AI's output quality is improving or degrading. + + + + +Most settings on the template are overridable per-application. Common overrides: + +| Override | What it does | +|---|---| +| **Model** | Use a different judge model than the template's default. | +| **Mode** (Agent evals) | Switch between `Quick`, `Auto`, and `Agent`. | +| **Use Internet, Connectors, Knowledge Bases** (Agent evals) | Adjust agent capabilities for this dataset. | +| **Context** | Pick which [context options](/docs/evaluation/concepts/data-injection) the eval gets. | +| **Pass threshold** | Tighten or loosen the score cutoff. | +| **Error Localization** | Turn on to flag the offending field per failed row. 
| +| **Reason column** | Add a second column with the eval's explanation per row. | + +These overrides apply only to this dataset. The template stays unchanged. Other datasets using the same template are not affected. + + + + + +Click **Add & Run**. The platform queues an eval job that processes every row. Each row gets a verdict and (if Reason column is on) a reason. + +A new column appears on the dataset for the eval result. Aggregates show in the eval summary at the top. + +![Dataset with eval results populated per row, an average score at the bottom, and a click-through detail popup showing the eval's reason for one row](/images/docs/evaluation/evaluate/dataset-with-results-running.png) + + + + --- -## How to - -Choose **UI** or **SDK** below; each tab has the process in steps. - - - - - - You need a dataset to run evals from the UI. If you don’t have one, add a dataset first. See [Dataset overview](/docs/dataset). - ![Select a dataset](/screenshot/product/evaluation/evaluate/1.png) - - - Open your dataset, then click **Evaluate** in the top-right. The evaluation configuration panel opens. - ![Open the evaluation panel](/screenshot/product/evaluation/evaluate/2.png) - - - Click **Add Evaluation**. Choose a **built-in template** (e.g. Tone) or click **Create your own eval**. For a built-in template: click it, give it a **name**, and under **config** select the **dataset column(s)** to use as input (and output if required). - ![Add and run an eval](/screenshot/product/evaluation/evaluate/3.png) - ![Add and run an eval](/screenshot/product/evaluation/evaluate/4.png) - - - Optionally enable **Error Localization** to pinpoint which part of a row caused a failure. Select a **model** if the template requires one. Click **Add & Run** to score every row in the dataset. - ![Add and run an eval](/screenshot/product/evaluation/evaluate/5.png) - - - From the Add Evaluation flow, click **Create your own eval** to define a custom template (name, model, rule prompt, output type, and optional settings). After you save it, the new eval appears in the evaluation list and you can add it to your dataset as in the step above. For full details on creating and configuring custom evals, see [Create custom evals](/docs/evaluation/features/custom). - - - - - - Some evals can run without an API key using the standalone `evaluate()` function, including local metrics like `contains`, `faithfulness`, and LLM-as-judge. See the SDK reference for details. - - - - Install the package **ai-evaluation** and create an `Evaluator` with your Future AGI API key and secret. Prefer setting **FI_API_KEY** and **FI_SECRET_KEY** in the environment instead of passing them in code. See [Accessing API keys](/docs/admin-settings#accessing-api-keys). - - ```bash - pip install ai-evaluation - ``` - - ```python - from fi.evals import Evaluator - - evaluator = Evaluator( - fi_api_key="your_api_key", - fi_secret_key="your_secret_key", - ) - ``` - - - Call `evaluate` with the eval template **name** (e.g. `tone`), **inputs** (dict with the keys the template expects, e.g. `"input"`), and **model_name**. Many built-in (system) templates require a model. - - ```python - result = evaluator.evaluate( - eval_templates="tone", - inputs={ - "input": "Dear Sir, I hope this email finds you well. 
I look forward to any insights or advice you might have whenever you have a free moment" - }, - model_name="turing_flash", - ) - print(result.eval_results[0].output) - print(result.eval_results[0].reason) - ``` - - - For long-running or large runs, set `is_async=True`. The call returns immediately with an **eval_id**; the evaluation runs in the background. - - ```python - result = evaluator.evaluate( - eval_templates="tone", - inputs={"input": "Your text here"}, - model_name="turing_flash", - is_async=True, - ) - eval_id = result.eval_results[0].eval_id - ``` - - - Use `get_eval_result(eval_id)` to fetch the result when the evaluation has finished. The same method works for both sync and async runs (e.g. to re-fetch a result). - - ```python - result = evaluator.get_eval_result(eval_id) - print(result.eval_results[0].output) - print(result.eval_results[0].reason) - ``` - - - To use a template you created in the UI, pass its **name** as `eval_templates` and supply the **inputs** dict with the keys your template’s required_keys expect (e.g. `"input"`, `"output"`). Use the same template name you see in the evaluation list. - - ```python - result = evaluator.evaluate( - eval_templates="name-of-your-eval", - inputs={ - "input": "your_input_text", - "output": "your_output_text" - }, - model_name="model_name" - ) - - print(result.eval_results[0].output) - print(result.eval_results[0].reason) - ``` - - - - - For **system (built-in)** eval templates, **model_name** is required and must be one of the models listed for that template. The backend validates required input keys from the template’s config. - - - - - -For more eval templates and Future AGI models, see [Built-in evals](/docs/evaluation/builtin) and [Future AGI models](/docs/evaluation/features/futureagi-models). - +## Apply to a dataset (SDK) + +Use this when you want to script eval runs as part of an offline pipeline. 
+ + + +```python title="Python" +from fi.datasets import Dataset + +# Load the dataset +dataset = Dataset.get("my-dataset") + +# Add an eval to it (creates the column and queues the run) +dataset.add_evaluation( + name="response_groundedness", + eval_template="groundedness", + required_keys_to_column_names={ + "output": "ai_response", + "context": "source_document", + }, + run=True, +) + +# Get aggregate stats +stats = dataset.get_eval_stats() +print(stats) +``` + +```typescript title="TypeScript" +const response = await fetch( + `https://api.futureagi.com/model-hub/develops/${datasetId}/add_user_eval/`, + { + method: "POST", + headers: { + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + "Content-Type": "application/json", + }, + body: JSON.stringify({ + name: "response_groundedness", + template_id: "groundedness-template-uuid", + config: { + mapping: { + output: "column-uuid-for-ai-response", + context: "column-uuid-for-source", + }, + reason_column: true, + }, + run: true, + }), + }, +); + +console.log(await response.json()); +``` + +```bash title="cURL" +curl -X POST "https://api.futureagi.com/model-hub/develops/{dataset_id}/add_user_eval/" \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "response_groundedness", + "template_id": "groundedness-template-uuid", + "config": { + "mapping": { + "output": "column-uuid-for-ai-response", + "context": "column-uuid-for-source" + }, + "reason_column": true + }, + "run": true + }' +``` + + + +--- + +## Run a single eval (SDK) + +Use this when you have an input in code and want to evaluate it without a dataset. + + + +```python title="Python" +from fi.evals import Evaluator + +evaluator = Evaluator( + fi_api_key="YOUR_API_KEY", + fi_secret_key="YOUR_SECRET_KEY", +) + +result = evaluator.evaluate( + eval_templates="toxicity", + inputs={"output": "You're awesome at this!"}, + model_name="turing_flash", +) + +print(result.eval_results[0].output) # "Passed" +print(result.eval_results[0].reason) +``` + +```typescript title="TypeScript" +import { Evaluator } from "@future-agi/ai-evaluation"; + +const evaluator = new Evaluator(); + +const result = await evaluator.evaluate( + "toxicity", + { output: "You're awesome at this!" }, + { modelName: "turing_flash" }, +); + +console.log(result); +``` + +```python title="Async" +# For long-running or large-batch runs +result = evaluator.evaluate( + eval_templates="toxicity", + inputs={"output": "..."}, + model_name="turing_flash", + is_async=True, +) +eval_id = result.eval_results[0].eval_id + +# Fetch when ready +result = evaluator.get_eval_result(eval_id) +print(result.eval_results[0].output) +``` + +```bash title="cURL" +curl -X POST https://api.futureagi.com/sdk/api/v1/new-eval/ \ + -H "X-Api-Key: YOUR_API_KEY" \ + -H "X-Secret-Key: YOUR_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "eval_templates": "toxicity", + "inputs": {"output": "You are awesome at this!"}, + "model_name": "turing_flash" + }' +``` + + + +The eval template can be a built-in name (`toxicity`, `groundedness`, `tone`, ...) or a custom template you created. + + +Some local metrics (like `contains`, `regex`, `bleu_score`) run client-side and don't need an API key. See [SDK reference](/docs/sdk/evals) for the full list. 
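+
+For example, a purely local string check might look like the sketch below. This is a minimal sketch, not the definitive signature: it assumes the `contains` template reads the text from `output` and the substring to look for from `keyword`; check the [SDK reference](/docs/sdk/evals) for each metric's exact input keys.
+
+```python
+from fi.evals import Evaluator
+
+evaluator = Evaluator(
+    fi_api_key="YOUR_API_KEY",
+    fi_secret_key="YOUR_SECRET_KEY",
+)
+
+result = evaluator.evaluate(
+    eval_templates="contains",
+    inputs={
+        "output": "Our return policy allows returns within 30 days.",
+        "keyword": "30 days",  # assumed input key; see the SDK reference
+    },
+    # No model_name here: local metrics run client-side, so no judge model is involved.
+)
+
+print(result.eval_results[0].output)  # "Passed" when the substring is present
+```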
+ + +--- + +## Run multiple evals at once (SDK) + +```python +results = evaluator.evaluate( + eval_templates=["toxicity", "groundedness", "tone"], + inputs={ + "output": "...", + "context": "...", + }, + model_name="turing_flash", +) + +for r in results.eval_results: + print(r.eval_template_name, r.output, r.reason) +``` + +--- + +## Reading the result + +A result has a fixed shape. The exact format of `output` depends on the eval's [output type](/docs/evaluation/concepts/output-types): + +| Output type | `output` is | +|---|---| +| **Pass/fail** | The string `"Passed"` or `"Failed"` | +| **Scoring** | An object: `{ "choice": "Good", "score": 0.7 }` | +| **Choices** | An object: `{ "choice": "Formal", "score": 1.0 }` (or a list when multi-choice) | + +```python +r = result.eval_results[0] + +r.output # the verdict +r.reason # plain-language explanation +r.runtime # seconds +r.model # the model that judged +r.eval_id # unique ID, used for async fetch +``` + +See [Eval results](/docs/evaluation/concepts/eval-results) for the full schema. + +--- + +## Common patterns + +### CI/CD gating + +Run a fixed eval suite on every pull request and fail the build when pass rate drops below threshold. See [Evaluate CI/CD pipeline](/docs/evaluation/features/cicd). + +### Compare prompts side by side + +Use [Experiments](/docs/dataset/features/experiments) to evaluate the same dataset with two different prompts and see the eval scores per variant. + +### Evaluate production traffic + +Attach evals to a [trace project](/docs/observe/features/evals) so every span captured from production gets scored automatically. --- ## Next Steps - - Define your own eval rules and criteria. + + Try an eval against a sample input before applying it. - - Run multiple evals together as a group. + + Pre-built templates ready to apply. - - Bring your own model for evaluations. + + Author your own template. - - Built-in models available for evals. + + Read and aggregate results. - Run evals automatically in your pipeline. - - - How evaluation fits into the platform. + Gate deploys on eval scores. diff --git a/src/pages/docs/evaluation/features/futureagi-models.mdx b/src/pages/docs/evaluation/features/futureagi-models.mdx index 7dce3c35..b244d6d0 100644 --- a/src/pages/docs/evaluation/features/futureagi-models.mdx +++ b/src/pages/docs/evaluation/features/futureagi-models.mdx @@ -15,26 +15,20 @@ All models are available in the platform UI and the SDK, and work with both buil ## Available models -- **TURING_LARGE** `turing_large`: Flagship evaluation model that delivers best-in-class accuracy across multimodal inputs (text, images, audio). Recommended when maximal precision outweighs latency constraints. +- **Turing Large** `turing_large`: Flagship evaluation model that delivers best-in-class accuracy across multimodal inputs (text, images, audio). Recommended when maximal precision outweighs latency constraints. -- **TURING_SMALL** `turing_small`: Compact variant that preserves high evaluation fidelity while lowering computational cost. Supports text and image evaluations. +- **Turing Small** `turing_small`: Compact variant that preserves high evaluation fidelity while lowering computational cost. Supports text and image evaluations. -- **TURING_FLASH** `turing_flash`: Latency-optimised version of TURING, providing high-accuracy assessments for text and image inputs with fast response times. - -- **PROTECT** `protect`: Real-time guardrailing model for safety, policy compliance, and content-risk detection. 
Offers very low latency on text and audio streams and permits user-defined rule sets. - -- **PROTECT_FLASH** `protect_flash`: Ultra-fast binary guardrail for text content. Designed for first-pass filtering where millisecond-level turnaround is critical. +- **Turing Flash** `turing_flash`: Latency-optimised model providing high-accuracy assessments for text and image inputs with fast response times. Use for high-volume runs. --- ### Quick comparison | Model | Code | Inputs | Best for | Latency | | --- | --- | --- | --- | --- | -| TURING_LARGE | `turing_large` | Text, image, audio | Max accuracy, multimodal evals | Higher | -| TURING_SMALL | `turing_small` | Text, image | High fidelity, lower cost | Medium | -| TURING_FLASH | `turing_flash` | Text, image | Fast, high-accuracy evals | Low | -| PROTECT | `protect` | Text, audio | Safety, guardrails, user-defined rules | Low | -| PROTECT_FLASH | `protect_flash` | Text | First-pass binary filtering | Ultra-low | +| Turing Large | `turing_large` | Text, image, audio | Max accuracy, multimodal evals | Higher | +| Turing Small | `turing_small` | Text, image | High fidelity, lower cost | Medium | +| Turing Flash | `turing_flash` | Text, image | Fast, high-accuracy evals | Low | --- @@ -52,7 +46,7 @@ All models are available in the platform UI and the SDK, and work with both buil - Pass `model_name` in your `evaluator.evaluate()` call. Use the model code from the table above (e.g. `turing_flash`, `turing_large`, `protect`). + Pass `model_name` in your `evaluator.evaluate()` call. Use the model code from the table above (e.g. `turing_flash`, `turing_large`). ```python from fi.evals import Evaluator @@ -61,7 +55,7 @@ All models are available in the platform UI and the SDK, and work with both buil result = evaluator.evaluate( eval_templates="tone", inputs={"input": "Your text to evaluate."}, - model_name="turing_small", # or turing_flash, turing_large, protect, protect_flash + model_name="turing_small", # or turing_flash, turing_large ) ``` @@ -80,8 +74,8 @@ All models are available in the platform UI and the SDK, and work with both buil Define your own eval rules and choose a model to run them. - - Run multiple evals together as a group. + + Combine multiple checks into a single composite score. Bring your own model for evaluations. diff --git a/src/pages/docs/evaluation/features/ground-truth.mdx b/src/pages/docs/evaluation/features/ground-truth.mdx new file mode 100644 index 00000000..82968f65 --- /dev/null +++ b/src/pages/docs/evaluation/features/ground-truth.mdx @@ -0,0 +1,198 @@ +--- +title: "Ground Truth" +description: "Provide labelled examples for an eval template so the judge sees concrete examples of how to score similar inputs. Used as few-shot context, retrieved by semantic similarity at run time." +--- + +## About + +Ground Truth is a set of labelled examples attached to an eval template. When the eval runs on a new row, the platform retrieves the most similar examples from the ground truth set and includes them in the judge's prompt. This shows the judge concrete examples of the kind of verdict you expect, instead of relying only on the criteria you wrote. + +Ground truth is the right tool when criteria alone don't pin down the verdict precisely enough. Instead of writing more rules, you give the judge a few labelled cases and let it generalise. + +This applies to **LLM-As-A-Judge** and **Agent** evals. Code evals don't use ground truth (no judge to prompt). 
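+
+To make "labelled examples" concrete, the sketch below writes a tiny, hypothetical ground truth file for a groundedness eval. The column names (`source_col`, `response_col`, `verdict`) are illustrative; you map them to the eval's variables and the verdict role when you attach the file, as described below.
+
+```python
+import json
+
+# Hypothetical labelled examples for a groundedness eval. Column names are
+# placeholders; they get mapped to the eval's variables during setup.
+examples = [
+    {
+        "source_col": "Paris is the capital of France.",
+        "response_col": "The capital of France is Paris.",
+        "verdict": "Passed",
+    },
+    {
+        "source_col": "Paris is the capital of France.",
+        "response_col": "Paris is the capital of Germany.",
+        "verdict": "Failed",
+    },
+]
+
+with open("labelled_examples.json", "w") as f:
+    json.dump(examples, f, indent=2)
+```
+
+Upload the resulting file from the UI or via the API (see "Manage from the SDK" below).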
+ +--- + +## When to use + +- Your custom criteria produces inconsistent verdicts across runs and you want to anchor the judge with examples +- You have human-labelled examples already (from QA, support, or annotation) and want to reuse them +- Verdicts depend on tone or style that's hard to write as rules but easy to show with examples +- A built-in eval is close to what you need but its default verdicts disagree with yours on edge cases + +When **not** to use: + +- The criteria is fully specified by the rule prompt and works consistently +- You only have a handful of examples (under 5) — too few to retrieve meaningfully + +--- + +## How it works + +1. You upload a labelled dataset (CSV, Excel, or JSON) and attach it to the template. +2. The platform embeds each row so similar rows can be retrieved. +3. At eval time, for each new input, the platform retrieves the most similar ground truth rows. +4. Those rows are added to the judge's prompt as few-shot examples. +5. The judge weighs the criteria and the examples together when producing the verdict. + +The number of examples retrieved is small (typically 3-5) so the prompt stays focused. + +--- + +## Add ground truth from the UI + + + + + +Open the eval template's detail page and switch to the **Ground Truth** tab. + +![Eval detail page with the Ground Truth tab selected, showing the empty state and the Add Ground Truth dialog open on the right](/images/docs/evaluation/ground-truth/tab-selected.png) + + + + + +Click **Add Ground Truth**. You have two options: + +- **Upload a file** — CSV, Excel (`.xls`, `.xlsx`), or JSON. Max 50 MB. +- **Choose an existing dataset** — pick from datasets already in your workspace. + +![Add Ground Truth dialog at the Map Variables step, showing the selected dataset, ground truth name, and variable mapping for {{output}}](/images/docs/evaluation/ground-truth/map-variables.png) + + + + + +Once the file is loaded, you map its columns to the eval's variables. + +| Setting | What it does | +|---|---| +| **Variable mapping** | Maps each `{{variable}}` in your eval criteria to a column in the ground truth file. The values from those columns become the few-shot examples' inputs. | +| **Role mapping** | Maps semantic roles (e.g. `user`, `assistant`, `expected`) to columns. Used when the few-shot examples need to be formatted as a conversation. | + +Example: a `groundedness` eval with variables `output` and `context` plus a labelled `verdict` column. Map `output → response_col`, `context → source_col`, and the verdict label to the role mapping so the example shows the expected outcome. + +![Configured ground truth page showing Variable Mapping, Role Mapping, and embedding generation status alongside a data preview](/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png) + + + + + +After save, the platform embeds each row. The status moves through: + +| Status | What it means | +|---|---| +| `Pending` | Queued for embedding. | +| `Embedding...` | Being processed. | +| `Completed` | Ready to use. New eval runs will retrieve from this set. | +| `Failed` | Something went wrong. The eval still runs but without ground truth. | + +You can run the eval as soon as the status is **Completed**. + + + + + +--- + +## Manage from the SDK + +Ground truth is also available via the API. 
+ + + +```python title="Upload" +import requests + +with open("labelled_examples.csv", "rb") as f: + response = requests.post( + f"https://api.futureagi.com/model-hub/eval-templates/{template_id}/ground-truth/upload/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, + files={"file": f}, + data={"name": "groundedness-examples"}, + ) + +ground_truth_id = response.json()["id"] +``` + +```python title="Set variable mapping" +requests.put( + f"https://api.futureagi.com/model-hub/ground-truth/{ground_truth_id}/mapping/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, + json={ + "variable_mapping": { + "output": "response_col", + "context": "source_col", + } + }, +) +``` + +```python title="Trigger embedding" +requests.post( + f"https://api.futureagi.com/model-hub/ground-truth/{ground_truth_id}/embed/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, +) +``` + +```python title="Check status" +response = requests.get( + f"https://api.futureagi.com/model-hub/ground-truth/{ground_truth_id}/status/", + headers={ + "X-Api-Key": "YOUR_API_KEY", + "X-Secret-Key": "YOUR_SECRET_KEY", + }, +) +print(response.json()) # {"embedding_status": "completed", "embedded_row_count": 124} +``` + + + +--- + +## Tips + +- **Quality beats quantity.** A small set of high-quality, well-labelled examples helps more than a large set of noisy ones. Aim for 20-100 to start. +- **Cover the failure modes.** Include examples of the cases your eval gets wrong without ground truth, not just easy passes. +- **Keep ground truth and criteria aligned.** If the examples disagree with the rule prompt, the judge gets confused. Update both together when you change the eval's behaviour. +- **Re-embed when you change the file.** Uploads create a new ground truth record; the old one isn't auto-updated. Delete the old one or pick the new one in the eval config. + +--- + +## What it costs + +Ground truth adds a few extra tokens to each eval call (the retrieved examples). Typical impact: + +- Embedding cost: one-time, small (depends on ground truth size). +- Per-eval cost: slightly higher prompt tokens. The exact amount depends on how many examples are retrieved (usually 3-5) and their length. + +For high-volume runs, prefer concise examples to keep the per-row cost down. + +--- + +## Next Steps + + + + Author the template that ground truth attaches to. + + + Templates and the role of ground truth in their config. + + + Test the eval with ground truth on before saving. + + + Pair ground truth with localization for deeper failure analysis. + + diff --git a/src/pages/docs/evaluation/features/test-playground.mdx b/src/pages/docs/evaluation/features/test-playground.mdx new file mode 100644 index 00000000..669e1986 --- /dev/null +++ b/src/pages/docs/evaluation/features/test-playground.mdx @@ -0,0 +1,141 @@ +--- +title: "Test Playground" +description: "Try an eval template against a single input, a dataset row, a captured span, or a simulation call before committing it to a dataset or trace project." +--- + +## About + +The Test playground runs an eval against a sample input and shows you the verdict and reason without saving anything. It's the right place to validate a template while you author it: quick to iterate, no consequences. + +The playground is built into the eval create and detail pages. You don't navigate to it separately. 
+ +--- + +## When to use + +- While authoring a custom eval, before saving the first version +- After editing a template, to confirm a change improves verdicts +- When debugging a row that produced an unexpected verdict +- When checking how a built-in eval behaves on data that looks like yours + +The Test playground does not write to your dataset, trace project, or simulation. Each test is independent. + +--- + +## Four source modes + +The playground accepts test data from four sources. Pick the source that matches where you intend to apply the eval. + +| Source tab | What you provide | +|---|---| +| **Custom** | Type or paste a JSON object with the variables the eval expects. Default tab. | +| **Dataset** | Pick a dataset and a specific row. The row's columns become the input. | +| **Tracing** | Pick a trace project and a specific span, trace, session, or voice call. The captured data becomes the input. | +| **Simulation** | Pick a simulation run and a specific call. The call's transcript and metadata become the input. | + +![Test playground panel showing the Dataset, Tracing, Simulation, and Custom source tabs at the top, with the Custom tab selected and a JSON editor below](/images/docs/evaluation/test-playground/custom-tab.png) + +--- + +## Custom mode + +Type or paste JSON directly. Useful for quick checks without needing a dataset or trace. + +The playground reads the variables from your eval template and shows a mapping section: drop the right value into each variable. You can also paste a JSON blob and pick which key maps to which variable. + +A Falcon AI button (bottom-right of the JSON area) opens a generate-test-data prompt: describe the kind of input you want and the platform produces a JSON sample that fits the eval's variables. Useful for spinning up edge cases you wouldn't think of by hand. + +--- + +## Dataset mode + +Pick a dataset, then pick a row. The row's column values become the input the eval sees. + +Use this when you want to test the eval against the same data shape it will see in production. The mapping is the same one you'll use when applying the eval to the full dataset. + +Each variable on the eval gets a dropdown to pick which dataset column to use. Once mapped, the row's values fill in automatically. + +--- + +## Tracing mode + +Pick a trace project, then pick a row type (Span, Trace, Session, or Voice Call) and a specific item. The captured data becomes the input. + +This is the right mode when you're authoring an eval to attach to a trace project. The variables map to span attributes (e.g. `output` → `gen_ai.completion`). + +Tracing mode is the only place to see how an eval behaves with span context, full trace context, or session context turned on, before committing to those settings on the template. + +--- + +## Simulation mode + +Pick a simulation run, then a specific call. The call's transcript, recording, and scenario become the input. + +Use this when authoring an eval that scores agent calls. Built-in evals like `Customer Agent Conversation Quality`, `Loop Detection`, and `Termination Handling` all run in this mode. + +--- + +## Run a test + +Click **Test Evaluation** to evaluate the test data with the current template settings. 
The result panel shows: + +- **Verdict** in the format dictated by the [output type](/docs/evaluation/concepts/output-types) (Passed / Failed, score + label, or label) +- **Reason** explaining the verdict +- **Runtime** in seconds +- **Model** that judged + +For Agent evals, the result panel also shows the steps the agent took, including any tool calls or knowledge base searches. Useful for debugging why a verdict went one way or another. + +![Test playground result for a toxicity eval running on an audio dataset row, showing the verdict, the explanation, the variable mapping, and the Test Evaluation button at the bottom](/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png) + +--- + +## Test vs Save + +| Action | What it does | +|---|---| +| **Test** | Runs the eval with current settings against the test data. Nothing is persisted. Each Test click is independent. | +| **Save** | Writes the current settings to the template, creating a new version. The new default applies to new uses; existing applications stay on whichever version they pinned to. See [Versioning](/docs/evaluation/concepts/versioning). | + +Test until the verdicts look right. Then Save. + +--- + +## What carries over + +The settings you change on the page apply to the test: + +- Eval type (Agents / LLM-As-A-Judge / Code) and the criteria +- Model +- Mode, Use Internet, Connectors, Knowledge Bases, Context (for Agent evals) +- Output type, choice scores, pass threshold +- Few-shot examples (for LLM-As-A-Judge) + +This means you can edit the criteria, click Test, see the new verdict, edit again, click Test again, all without committing. Save when you're done. + +--- + +## Limitations + +- Test runs are not persisted and don't count toward dataset or trace eval results. +- Test results are not aggregated. Run a few different inputs to get a sense of consistency. +- Local code evals run in the same sandbox as production, so any environment limits apply (CPU, memory, allowed modules). + +--- + +## Next Steps + + + + Author a template and test it as you go. + + + Apply a tested eval to a dataset or trace project. + + + Save creates a new version. Old versions stay for rollback. + + + Pick the output type before testing. + + diff --git a/src/pages/docs/evaluation/index.mdx b/src/pages/docs/evaluation/index.mdx index dd526cb4..fca375a2 100644 --- a/src/pages/docs/evaluation/index.mdx +++ b/src/pages/docs/evaluation/index.mdx @@ -11,7 +11,7 @@ There are two building blocks: **eval templates** define what to measure (task c Evaluations run across every surface in Future AGI: datasets, simulations, experiments, playground, replay sessions, and CI/CD pipelines. You can also run them programmatically via the SDK. Using the same templates and configs across contexts keeps results directly comparable without redefining your quality criteria each time. -Future AGI ships 70+ built-in templates covering quality, safety, factuality, RAG retrieval, format, bias, audio, and image evaluation. You can also create custom templates and bundle any combination into **eval groups** to apply multiple evals in a single run. +Future AGI ships 70+ built-in templates covering quality, safety, factuality, RAG retrieval, format, bias, audio, and image evaluation. You can also create custom templates of three types ([Agents, LLM-As-A-Judge, or Code](/docs/evaluation/concepts/eval-types)) and bundle several into a [composite eval](/docs/evaluation/concepts/composite-evals) when you want a single combined verdict. 
## How Evaluation Connects to Other Features @@ -27,17 +27,23 @@ Future AGI ships 70+ built-in templates covering quality, safety, factuality, RA Run the first eval from the UI or SDK in minutes. + + Define your own eval rules in any of the three types. + - 70+ templates: quality, safety, factuality, RAG, and more. + 70+ templates across quality, safety, factuality, RAG, and more. - - Define your own eval rules and output types. + + Try a template against a row, span, simulation, or custom JSON before applying it. + + + Anchor judges with labelled examples retrieved at run time as few-shot context. - - Bundle multiple evals and run them together. + + Pinpoint which input field caused a row to fail. - Pick the right evaluation model for your task. + Pick the right judge model for your check. Run evals automatically on every pull request. diff --git a/src/pages/docs/observe/features/evals.mdx b/src/pages/docs/observe/features/evals.mdx index 24225eca..b6dd8992 100644 --- a/src/pages/docs/observe/features/evals.mdx +++ b/src/pages/docs/observe/features/evals.mdx @@ -1,79 +1,165 @@ --- -title: "Run Evals on Traces in Future AGI Observe" -description: "Run automated quality checks on traced spans in Observe: filter spans, choose historic or continuous runs, set sampling, and attach preset or custom evals." +title: "Run Evals on Traces" +description: "Configure eval tasks on a tracing project. Tasks run an eval template against matching spans on a schedule, with filters, sampling, and historical or continuous run types." --- ## About -Evals run automated quality checks on your production traces, scoring every LLM response for hallucination, tone, bias, toxicity, and more. You configure which checks to run, filter which spans they apply to, and choose whether to evaluate historical data or new spans as they arrive. Results appear per span in the Observe dashboard and can trigger alerts when quality drops. +Evals on traces score what your AI app actually did in production or staging. You configure an **eval task** on a tracing project; the task runs an eval template against matching spans (or traces, sessions, or voice calls) on the schedule you set. -{/* ARCADE EMBED START */} - -
-{/* ARCADE EMBED END */} +The same eval templates used for datasets and simulations work on traces. The difference is the mapping: you map variables to span attributes instead of dataset columns. --- ## When to use -- **Scoring production output quality**: Run historic evals after a release to check for hallucinations, bias, or unsafe content across real traffic. -- **Catching regressions in production**: Set up a continuous eval task so new spans are scored automatically and you see quality drops before users report them. -- **Spot-checking a specific time window**: Filter by date range or session to evaluate only the spans from an incident or a specific user flow. -- **Controlling eval cost**: Use sampling rate and span limits to evaluate a representative subset instead of every span. -- **Running multiple quality checks at once**: Attach several evals to one task so each span gets scored for tone, safety, and accuracy in a single run. +- Scoring production traffic for hallucinations, toxicity, groundedness, or custom rules +- Catching quality regressions across releases by running the same checks continuously +- Spot-checking spans from a specific time window or session after an incident +- Running multiple checks at once over matching spans, with sampling and filters to control cost --- -## How to +## Two places to start + +You can manage tasks two ways: + +| Entry point | What it shows | +|---|---| +| **Tasks** in the sidebar (`/dashboard/tasks`) | Every task across every project. Useful for reviewing all your eval tasks in one place. | +| Inside a project: **Evals & Tasks** tab | Tasks scoped to that one project. Useful when you're working on a single project. | + +Both pages list the same tasks; the project-scoped view is filtered. Task creation and editing work the same from either entry. + +--- + +## Create a task - - Define filters so the task runs only on the spans you care about. - ![Set filters](/images/docs/observe/1.png) + + +From the Tasks list, click **Create Task**. The task configuration page opens with four sections: Basic Info, Evaluations, Filters, and Scheduling. + +![Task configuration page showing the Basic Info, Evaluations, Filters, and Scheduling sections plus the Live Preview panel on the right](/images/docs/observe/evals/task-config-page.png) + + + + + +| Field | What it does | +|---|---| +| **Task Name** | A name to identify the task in lists and logs. | +| **Project** | The tracing project the task runs against. Cannot be changed after creation. | + + + + + +In the **Evaluations** section, click **Add Evaluation** to open the eval picker. + +In the picker, browse or search for an eval. Pick one, then map each of its variables to a span attribute path. Common attributes: + +| Attribute path | What it holds | +|---|---| +| `gen_ai.input.messages.0.message.content` | The first user message in an LLM span | +| `gen_ai.output.messages.0.message.content` | The model's response | +| `input.value` | Generic span input | +| `output.value` | Generic span output | + +Custom attributes you set on your spans appear here too. + +For Agent evals, you can also pick context options. On traces these are: + +| Option | What it injects | +|---|---| +| **Template variables** | Only mapped variables (default). | +| **Full span context** | Complete span data and metadata. | +| **Trace context** | The full trace tree with every span. | +| **Session context** | The full conversation across multiple traces. 
| + +See [Data injection](/docs/evaluation/concepts/data-injection) for cost trade-offs. + +You can add multiple evaluations to the same task. Each runs on every span the task processes. + +![Editing an eval inside a task: criteria, output type, and error localization on the left, and the variable mapping panel on the right](/images/docs/observe/evals/editing-eval-in-tasks-page.png) - | Filter | Description | - |--------|-------------| - | `observation_type` | Node/span type (e.g. `llm`, `chain`, `agent`). | - | `date_range` | Time range: `[start_date, end_date]` applied to `created_at`. | - | `created_at` | Minimum creation time (spans at or after this value). | - | `project_id` | Restrict to a specific Observe project. | - | `session_id` | Restrict to traces in a given session. | - | `span_attributes_filters` | List of span-attribute conditions. | + - Filters are stored in the task's `filters` field and applied when the task runs. - + - - Set the **run type**: +The **Filters** section narrows which rows in the project the task applies to. Pick a row type first (`Spans`, `Traces`, or `Sessions`), then add filters on top. - ![Choose run type](/images/docs/observe/2.png) +Common filters: - - **Historical**: Run on existing spans matching the filters, up to the sampling cap and span limit. The task completes after processing. - - **Continuous**: Run on new spans as they arrive. Each run only processes spans created after the last run; the task stays active for ongoing evaluation. - +| Filter | What it does | +|---|---| +| Time range | Restricts to rows in a `[start, end]` window. | +| Observation type | Restricts span types like `llm`, `chain`, `agent`. | +| Span attributes | Conditions on arbitrary span attribute values. | - - ![Set sampling rate and span limit](/images/docs/observe/3.png) +Filters are stored on the task and re-applied each run. - - **sampling_rate**: Percentage of matching spans to evaluate (0-100). For example, `50` evaluates 50% of filtered spans per run. - - **spans_limit**: Maximum number of spans to process per run (default 1000). The task stops when either the sampled count or this limit is reached. - + - - Attach one or more eval configs to the task. The task runs each selected eval on every span it processes. For evals that need an input (e.g. Bias Detection), set the **input key** to a span attribute path (e.g. `gen_ai.output.messages.0.message.content`) so the eval reads the right field from each span. See [built-in evals](/docs/evaluation/builtin) for supported evaluations and their required inputs. - + - - ![run](/images/docs/observe/4.png) +The **Scheduling** section controls when and how much data the task processes: + +| Setting | What it does | +|---|---| +| **Run mode** | `Historical data` (one-shot over existing rows in a past time window) or `New incoming data` (continuously evaluates each new row as it arrives). | +| **Sampling rate** | Percentage of matching rows to evaluate (0-100). Use to limit cost. | +| **Span limit** | Maximum rows to process per run. The task stops at the lower of sampled count and this cap. | + + + + + +Save the task. The task starts running according to the schedule you set. Tasks have a status lifecycle: + +| Status | What it means | +|---|---| +| `Pending` | Created, not yet started. | +| `Running` | Processing matching rows. | +| `Completed` | Historical-data run finished. | +| `Paused` | Temporarily stopped (new-incoming-data tasks). Resume to keep going. | +| `Failed` | Errored. Check the task's logs for the cause. 
| + + - Create or update the eval task via the API or UI, then run it. You can test the configuration before saving. Task status values: `pending`, `running`, `completed`, `failed`, `paused`, `deleted`. Results appear on the spans in the Observe dashboard and can be used for alerts. - - - Eval tasks are processed asynchronously. Status and results update as runs complete. For continuous tasks, new spans are picked up on subsequent runs. - +--- + +## Where results show up + +Eval results appear: + +- **On individual spans:** Each evaluated span has eval results visible in the span detail view. +- **On the trace project's Charts:** Aggregate charts include eval-based metrics like pass rate and score. +- **In Alerts and Monitors:** Eval scores can drive [Alerts](/docs/observe/features/alerts) so you get notified when pass rate drops. + +![Trace project view with eval scores attached as columns on each trace row, showing per-row verdicts and an Add Eval button at the top right](/images/docs/observe/evals/evals-results-observe-page.png) + +--- + +## Edit, pause, resume, or delete a task + +From the task list: + +- Click a task to open its detail view, where you can change evals, filters, or schedule. +- Continuous tasks can be **paused** to temporarily stop processing new rows, and **resumed** later. +- Tasks can be deleted from the list. Past results stay accessible. + +--- + +## Tips + +- **Use sampling on large projects.** Most quality issues show up in a representative sample. Sampling at 5-10% is usually enough to catch regressions. +- **Continuous + alerts** is the production pattern. A continuous task on critical spans, plus an alert when pass rate drops, catches regressions before users do. +- **Historical for incidents.** When something goes wrong, run a historical task with tight time and session filters to score just the affected window. +- **Trace context isn't free.** When an Agent eval needs to read the full trace, prompt size grows quickly. Start with span context, escalate only if verdicts come out wrong. --- @@ -86,10 +172,10 @@ Evals run automated quality checks on your production traces, scoring every LLM Group traces into sessions for multi-turn analysis. - - View activity and metrics per end user. - - Get notified when metrics cross a threshold. + Get notified when eval pass rates drop. + + + Span, trace, and session context options. diff --git a/src/pages/docs/quickstart/running-evals-in-simulation.mdx b/src/pages/docs/quickstart/running-evals-in-simulation.mdx index 72eb3e81..3b8c1edc 100644 --- a/src/pages/docs/quickstart/running-evals-in-simulation.mdx +++ b/src/pages/docs/quickstart/running-evals-in-simulation.mdx @@ -1,109 +1,147 @@ --- -title: "Running Evals in Simulation: Score Agent Interactions" -description: "Run evaluations in Future AGI simulations to test AI agents against simulated customers and score interactions for quality and context retention." +title: "Running Evals in Simulation" +description: "Score simulated agent calls against eval templates. Pick built-in evals or create your own, map call data to eval variables, and review results per scenario." --- ## About -**Simulation** is Future AGI's agent testing product. It lets you run your AI agent against simulated customers in realistic scenarios without real users, real calls, or production risk. You define who the customer is, what they want, and how they behave. The platform drives the conversation and scores every interaction using evaluations you configure. 
The result is a detailed breakdown of where your agent succeeds and where it fails, before you ship. +Simulation runs your AI agent against simulated customers in defined scenarios. Evals score what the agent did. Together they let you test agent quality before any real users are involved: define who the customer is, watch the agent handle the call, and see how it scored on the criteria you care about. + +The same eval templates that work everywhere else in Future AGI also work here. The only difference is what you map: instead of dataset columns or span attributes, you map to the call's transcript, recording, or scenario fields. --- -**Prerequisites:** Before starting, make sure you have set up your [Agent Definition](/docs/simulation/concepts/agent-definition), [Scenarios](/docs/simulation/concepts/scenarios), and [Personas](/docs/simulation/concepts/personas). +## Prerequisites + +Before you start, make sure you have: + +- An [Agent Definition](/docs/simulation/concepts/agent-definition) (sidebar: **Simulate** → **Agent Definition**) +- One or more [Scenarios](/docs/simulation/concepts/scenarios) (sidebar: **Simulate** → **Scenarios**) +- One or more [Personas](/docs/simulation/concepts/personas) (sidebar: **Simulate** → **Personas**) + +--- + +## Configure evals on a new simulation run - - Navigate to your simulation and click **Run Simulation**. You'll see the eval configuration panel where you can add evaluators before starting the run. - ![Run simulation dashboard](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image1.png) - + + +In the sidebar, click **Simulate** → **Run Simulation**. This is the list of all your runs. + + + + + +Click **Create a Simulation**. The configuration wizard opens. It has four steps: + +1. **Add simulation details** — name, agent definition, agent version +2. **Choose Scenario(s)** — pick the scenarios to run +3. **Select Evaluations** — pick the evals to score every call +4. **Summary** — review and start + + + + + +In the **Select Evaluations** step, click **Add Evaluations** to open the eval picker. - - Click **Add Evaluation** to open the eval drawer. Choose from Future AGI's built-in simulation evals or create a custom one. +You can pick built-in evals or custom evals. The built-in evals tuned for simulation are under the `agents` and `chatbot_behaviors` tags: - ![Eval drawer](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image2.png) +| Eval | What it scores | +|---|---| +| `customer_agent_conversation_quality` | Overall conversation quality | +| `customer_agent_query_handling` | Whether the agent interprets and answers correctly | +| `customer_agent_context_retention` | Whether the agent remembers earlier turns | +| `customer_agent_human_escalation` | Whether escalation to a human happens at the right time | +| `customer_agent_loop_detection` | Whether the agent gets stuck repeating itself | +| `customer_agent_termination_handling` | Whether the agent ends the call cleanly | - **Recommended built-in evals for simulation:** - - `customer_agent_conversation_quality` — overall conversation quality - - `customer_agent_query_handling` — correct interpretation and relevant answers - - `customer_agent_context_retention` — agent remembers earlier context - - `customer_agent_human_escalation` — appropriate escalation to a human - - `customer_agent_loop_detection` — detects repetitive or looping responses +Browse the [full built-in list](/docs/evaluation/builtin) for more. 
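Under the hood, each eval's criteria is a prompt containing `{{variable}}` placeholders, and the mapping you set in the picker fills those placeholders from the call's fields before the judge model sees the prompt. The sketch below is only a conceptual illustration of that substitution, using made-up field names and criteria text; it is not the platform's implementation or SDK.

```python
# Conceptual sketch only: shows how {{variable}} placeholders in an eval's
# criteria get filled from call fields. Field names and criteria text are
# hypothetical, not Future AGI internals.

call_fields = {
    "transcript": "Customer: I was charged twice.\nAgent: Let me look into that refund for you.",
    "scenario": "Customer disputes a duplicate charge and wants a refund.",
}

# The mapping you configure in the picker: eval variable -> call field.
variable_mapping = {
    "conversation": "transcript",
    "situation": "scenario",
}

criteria = (
    "Given {{conversation}}, decide whether the agent resolves the issue "
    "described in {{situation}}. Answer Pass or Fail with a short reason."
)

def fill_criteria(criteria: str, mapping: dict[str, str], fields: dict[str, str]) -> str:
    """Substitute each {{variable}} with the value of its mapped call field."""
    prompt = criteria
    for variable, field in mapping.items():
        prompt = prompt.replace("{{" + variable + "}}", fields[field])
    return prompt

print(fill_criteria(criteria, variable_mapping, call_fields))
```

The variable mapping panel in the picker is this same lookup expressed in the UI: one row per `{{variable}}`, one call field per row.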
- See the full list of built-in evals [here](/docs/evaluation/builtin). - +![Select Evaluations step in the Create a Simulation wizard with the Add Evaluations button and the empty state below](/images/docs/simulation/add-evaluation-button.png) - - After selecting an eval, a configuration drawer opens. Fill in the required fields: +After clicking **Add Evaluations**, the picker drawer opens. Search for an eval by name (e.g. `customer_agent`) or filter by tag, then click an eval to expand its details and review the criteria before adding it. - ![Configure eval](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image3.png) +![Eval picker drawer in the simulation wizard with a customer_agent search and one eval expanded showing required keys, model, and criteria](/images/docs/simulation/search-evals.png) - - **Name**: displayed in your simulation dashboard after the run - - **Language Model**: recommended `TURING_LARGE` - - **Required Inputs**: map the eval's input keys to your simulation columns: - - `conversation` maps to `Mono Voice Recording` or `Stereo Recording` - - `input` maps to `person` or `situation` - - `output` maps to `Mono Voice Recording`, `Stereo Recording`, or `outcome` + - Click **Save Eval** when done. + - ![Save eval](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image5.png) - +For each eval you pick, the picker drawer opens for configuration: - - The saved eval appears under **Selected Evaluations**. You can add multiple evals to a single run to test the agent more broadly. +| Field | What it does | +|---|---| +| **Name** | Label shown in the simulation results. | +| **Model** | The judge model. Defaults to the template's setting. `turing_large` for highest accuracy on complex calls; `turing_flash` for cheaper high-volume runs. | +| **Variable mapping** | Map each `{{variable}}` to a call field. Voice runs expose recording fields (`stereo_recording`, `voice_recording`, `assistant_recording`, `customer_recording`) plus a transcript; text runs expose chat transcripts. The exact list depends on whether the run is voice or text. | +| **Context** (Agent evals) | What the eval sees beyond your mapped variables. `Call context` is the typical pick on simulation. See [Data injection](/docs/evaluation/concepts/data-injection). | +| **Mode** (Agent evals) | `Auto`, `Agent`, or `Quick`. | - ![Selected evaluations](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image6.png) - +Save the eval to add it to the run. You can add multiple evals; each runs independently on every call. - - Once you've added all the evals you need, click **Next** and then run the simulation. - +![Eval configuration view showing the criteria, model, output type, and the Variable Mapping panel mapping eval variables to call fields](/images/docs/simulation/map-variables-for-evals.png) + + + + + +Move to the **Summary** step, review the configuration, then click **Run Simulation** to start. Calls execute in parallel; evals run after each call completes. + + + + + +When the run finishes, the run detail page shows scores per scenario and per call. Click into a call to see the full transcript alongside each eval's verdict and reason. + +![Simulation results dashboard showing call details, system metrics, evaluation metrics aggregates, and a per-call grid with eval verdicts for each scenario](/images/docs/simulation/eval-results-page.png) + + - - After the simulation completes, your results appear in the simulation dashboard. 
Each scenario shows a score for every eval you configured. You can drill into individual conversations to see the full transcript and where the agent scored well or poorly. - --- -## Creating a Custom Eval +## Add evals to a run that already exists -If the built-in evals don't cover your use case, you can create your own. +To add or change evals on an existing run, open the run from the Run Simulation list, then use **Add Evaluation** in the run's evaluation panel. You can re-run only the evals (no need to re-run the calls themselves). - - - In the eval drawer, click **Create your own evals** and provide a unique name. +--- - ![Create custom eval](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image7.png) - +## Create a custom eval for simulation - - Select a model (recommended: `TURING_LARGE`) and write your evaluation criteria using `{{ }}` for input variables. +If the built-in evals don't cover what you need, create a custom one. Custom evals work the same in simulation as everywhere else; see [Create custom evals](/docs/evaluation/features/custom). - Example: *Given `{{conversation}}`, evaluate if the agent convinces the customer to purchase insurance.* +A few simulation-specific notes when authoring: - Map `{{conversation}}` to `Mono Voice Recording` or `Stereo Recording`. - +- **Agent type is usually right** for calls because it can read the full transcript and reason across turns. LLM-As-A-Judge also works for simpler binary checks. +- **Use `{{variable}}` placeholders** in the criteria. When you map them later, you'll point them at call fields (transcript, recording, scenario data). +- **Test before saving** with the [Test Playground](/docs/evaluation/features/test-playground) using the **Simulation** source mode to run the eval against an existing call. - - Choose how the eval should score results: - - **Pass/Fail** — recommended for most cases - - **Percentage** — specify what 0% means - - **Categorical** — define all possible output labels +--- - Click **Create Evaluation** to save it as a reusable template under **User Built** evals. - +## Tips - - Your custom eval now appears in the eval drawer. Select it, give it a run name, map the input columns, and click **Save Eval**. +- **Start with `Auto` mode** for Agent evals. Switch to `Agent` only if you see verdicts you disagree with, or to `Quick` if you're doing a high-volume sweep. +- **Pick the right model.** `turing_flash` is fine for most binary checks; `turing_large` for nuanced quality scoring. +- **Review individual calls.** The aggregate score tells you something is off, but the reasons on individual call verdicts tell you what to fix. - ![Custom eval saved](/screenshot/product/simulation/quickstart-running-evals-in-simulation/image9.png) - - +--- ## Next Steps -- [Browse all built-in evals](/docs/evaluation/builtin) to find metrics that fit your use case -- [Set up agent definitions](/docs/simulation/concepts/agent-definition) if you haven't already -- [Learn about simulation concepts](/docs/simulation) for a deeper understanding of how scenarios and personas work + + + Browse all built-in evals; many are agent-tuned. + + + Pick Agents, LLM-As-A-Judge, or Code. + + + Try an eval on a single call before applying. + + + Concepts: agent definitions, scenarios, personas. + +