fireflyframework · miguelgfierro · Jun 30, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.github/workflows/pr-gate.yml b/.github/workflows/pr-gate.yml
@@ -57,7 +57,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra openai-embeddings --extra evaluation
       - run: uv run pyright
 
   test:
@@ -72,7 +72,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: '3.13'
-      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra script-execution
+      - run: uv sync --extra dev --extra binary --extra vectorstores-sqlite-vec --extra vectorstores-pgvector --extra openai-embeddings --extra evaluation --extra script-execution
       - run: uv run pytest -m "not nightly" --cov --cov-report=term-missing
 
   build:

diff --git a/README.md b/README.md
@@ -318,6 +318,12 @@ create your own components; the framework discovers them via duck typing.
   `EvalDataset` loads/saves test cases from JSON. `ModelComparison` runs the
   same prompts across multiple agents for side-by-side analysis.
 
+- **Evaluation** — LLM-as-judge metrics (faithfulness, relevancy, answer correctness,
+  RAGAS, …) and deterministic retrieval metrics (recall@k, MRR, MAP, nDCG, …) for
+  assessing LLM and pipeline outputs. Each metric is a plain function you call directly.
+  Install with `pip install "fireflyframework-agentic[evaluation]"`.
+  See [docs/evaluation.md](docs/evaluation.md) for the full guide.
+
   > **Optional developer tooling.** `fireflyframework_agentic.experiments` (A/B
   > experiments) and `fireflyframework_agentic.lab` (offline evaluation /
   > benchmarking) are leaf modules — nothing in the core imports them and they add
@@ -751,6 +757,7 @@ Detailed guides for each module:
 - [Secure Script Execution](docs/execution.md) — Deny-by-default Monty sandbox, static safety pre-screen, `SecureScriptRunner`, Firefly Code Mode
 - [Experiments](docs/experiments.md) — A/B testing, variant comparison
 - [Lab](docs/lab.md) — Benchmarks, datasets, evaluators
+- [Evaluation](docs/evaluation.md) — LLM-as-judge metrics, RAGAS, retrieval metrics
 - Studio — moved to [fireflyframework-agentic-studio](https://github.com/fireflyframework/fireflyframework-agentic-studio)
 ---
 

diff --git a/docs/evaluation.md b/docs/evaluation.md
diff --git a/examples/llm_eval_example.py b/examples/llm_eval_example.py
@@ -0,0 +1,119 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""LLM-as-judge evaluation example.
+
+Score a set of Q&A pairs using the evaluation metrics:
+  - contains_answer  — does the answer contain the correct information?
+  - addresses_question — does the answer directly address what was asked?
+
+Usage::
+
+    python examples/llm_eval_example.py --model anthropic:claude-haiku-4-5
+
+    # Or score from a JSONL file instead of the built-in sample data:
+    python examples/llm_eval_example.py \\
+        --model anthropic:claude-haiku-4-5 \\
+        --items-file items.jsonl
+
+Items JSONL format — one JSON object per line::
+
+    {"question": "...", "answer": "...", "reference": "..."}
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+from pathlib import Path
+
+from fireflyframework_agentic.evaluation import (
+    EvalContext,
+    JudgeClient,
+    addresses_question,
+    contains_answer,
+)
+
+# Sample data used when no --items-file is provided. Each item carries the
+# "question", "answer", and "reference" keys described in the module docstring.
+# NOTE(review): the third answer does not address its question — presumably a
+# deliberate low-scoring case; confirm.
+_SAMPLE_ITEMS = [
+    {
+        "question": "What is the boiling point of water at sea level?",
+        "reference": "Water boils at 100 °C at standard atmospheric pressure.",
+        "answer": "Water boils at 100 degrees Celsius at sea level.",
+    },
+    {
+        "question": "Who wrote Romeo and Juliet?",
+        "reference": "Romeo and Juliet was written by William Shakespeare around 1594–1596.",
+        "answer": "It was written by Shakespeare.",
+    },
+    {
+        "question": "What is the capital of France?",
+        "reference": "The capital of France is Paris.",
+        "answer": "The weather in France is generally mild.",
+    },
+]
+
+
+async def score_items(items: list[dict], ctx: EvalContext) -> list[dict]:
+    """Score every item with both judge metrics, awaiting all calls together.
+
+    Returns one dict per item, in input order, holding the item's ``question``
+    and its ``contains_answer`` / ``addresses_question`` scores. A score is
+    ``None`` when the corresponding metric call returned a falsy result.
+    """
+
+    def _score_of(verdict):
+        # A falsy metric result carries no score to read.
+        return verdict["score"] if verdict else None
+
+    # Create both metric awaitables for every item before gathering anything.
+    pending = []
+    for item in items:
+        pending.append((contains_answer(item, ctx), addresses_question(item, ctx)))
+
+    # Inner gather pairs the two metrics of one item; outer gather spans all items.
+    verdicts = await asyncio.gather(*[asyncio.gather(*pair) for pair in pending])
+
+    rows = []
+    for item, (contains, addresses) in zip(items, verdicts, strict=True):
+        rows.append(
+            {
+                "question": item["question"],
+                "contains_answer": _score_of(contains),
+                "addresses_question": _score_of(addresses),
+            }
+        )
+    return rows
+
+
+async def main(args: argparse.Namespace) -> None:
+    """Score the items with the judge model and print a results table.
+
+    Items are read from ``args.items_file`` (JSONL, one object per non-blank
+    line) when it is set, otherwise the built-in ``_SAMPLE_ITEMS`` are used.
+    ``args.model`` is the judge model spec handed to ``JudgeClient``.
+
+    The table has one row per item plus an "Average" row. Each metric is
+    averaged over the items that have a score for it; a metric with no scores
+    at all is shown as ``n/a``.
+    """
+    if args.items_file:
+        lines = Path(args.items_file).read_text(encoding="utf-8").strip().splitlines()
+        items = [json.loads(line) for line in lines if line.strip()]
+    else:
+        items = _SAMPLE_ITEMS
+
+    ctx = EvalContext(client=JudgeClient(args.model))
+    results = await score_items(items, ctx)
+
+    print(f"\n{'Question':<45} {'contains':>8} {'addresses':>9}")
+    print("-" * 63)
+    for r in results:
+        # Truncate long questions so they fit the 45-character column.
+        q = r["question"][:43] + ".." if len(r["question"]) > 45 else r["question"]
+        ca = f"{r['contains_answer']:.2f}" if r["contains_answer"] is not None else "  n/a"
+        aq = f"{r['addresses_question']:.2f}" if r["addresses_question"] is not None else "    n/a"
+        print(f"{q:<45} {ca:>8} {aq:>9}")
+
+    # Average each metric over its own non-None scores. Either metric can be
+    # missing for an item while the other is present, so filtering rows on one
+    # metric and summing the other would hit ``None`` values.
+    ca_scores = [r["contains_answer"] for r in results if r["contains_answer"] is not None]
+    aq_scores = [r["addresses_question"] for r in results if r["addresses_question"] is not None]
+    if ca_scores or aq_scores:
+        avg_ca = f"{sum(ca_scores) / len(ca_scores):.2f}" if ca_scores else "n/a"
+        avg_aq = f"{sum(aq_scores) / len(aq_scores):.2f}" if aq_scores else "n/a"
+        print("-" * 63)
+        print(f"{'Average':<45} {avg_ca:>8} {avg_aq:>9}")
+    print(f"\n{len(items)} items scored.")
+
+
+if __name__ == "__main__":
+    # CLI entry point: declare the flags, parse them, then run the async scorer.
+    cli = argparse.ArgumentParser(description="Score Q&A pairs with LLM-as-judge metrics.")
+    cli.add_argument(
+        "--model",
+        default="anthropic:claude-haiku-4-5",
+        help="Judge model spec (provider:model).",
+    )
+    cli.add_argument(
+        "--items-file",
+        default=None,
+        help="Optional JSONL file of {question, answer, reference} items.",
+    )
+    cli_args = cli.parse_args()
+    asyncio.run(main(cli_args))
diff --git a/fireflyframework_agentic/evaluation/__init__.py b/fireflyframework_agentic/evaluation/__init__.py
@@ -0,0 +1,91 @@
+"""Evaluation metrics for LLM and pipeline outputs.
+
+LLM-as-judge metrics (``judge``), the spec-driven embedder factory (``embedder``),
+and deterministic retrieval metrics (``retrieval_metrics``).
+"""
+
+from fireflyframework_agentic.evaluation.embedder import build_embedder
+from fireflyframework_agentic.evaluation.judge import (
+    BASIC_METRICS,
+    PROCESS_MINING_METRICS,
+    AdvisoryReport,
+    EvalContext,
+    JudgeClient,
+    Metric,
+    actionability,
+    addresses_question,
+    answer_correctness,
+    answer_relevancy,
+    citation_relevance,
+    comparative_vs_champion,
+    contains_answer,
+    context_precision,
+    context_recall,
+    contradiction,
+    excerpt_fill_rate,
+    fabricated_entity,
+    faithfulness,
+    nc_semantic_precision,
+    numeric_temporal_fidelity,
+    open_gap,
+    parse_model,
+    ragas_faithfulness,
+    run_judge,
+    same_provider,
+    semantic_recovery,
+    severity_calibration,
+    source_coverage,
+    surface_deduplication,
+)
+from fireflyframework_agentic.evaluation.retrieval_metrics import (
+    citation_precision,
+    hit_at_k,
+    map_score,
+    mrr,
+    ndcg,
+    no_answer_rate,
+    precision_at_k,
+    recall_at_k,
+)
+
+# Public API of the package: exactly the names imported above from the
+# ``embedder``, ``judge`` and ``retrieval_metrics`` submodules.
+__all__ = [
+    "BASIC_METRICS",
+    "PROCESS_MINING_METRICS",
+    "AdvisoryReport",
+    "EvalContext",
+    "JudgeClient",
+    "Metric",
+    "actionability",
+    "addresses_question",
+    "answer_correctness",
+    "answer_relevancy",
+    "build_embedder",
+    "citation_precision",
+    "citation_relevance",
+    "comparative_vs_champion",
+    "contains_answer",
+    "context_precision",
+    "context_recall",
+    "contradiction",
+    "excerpt_fill_rate",
+    "fabricated_entity",
+    "faithfulness",
+    "hit_at_k",
+    "map_score",
+    "mrr",
+    "nc_semantic_precision",
+    "ndcg",
+    "no_answer_rate",
+    "numeric_temporal_fidelity",
+    "open_gap",
+    "parse_model",
+    "precision_at_k",
+    "ragas_faithfulness",
+    "recall_at_k",
+    "run_judge",
+    "same_provider",
+    "semantic_recovery",
+    "severity_calibration",
+    "source_coverage",
+    "surface_deduplication",
+]
diff --git a/fireflyframework_agentic/evaluation/embedder.py b/fireflyframework_agentic/evaluation/embedder.py
@@ -0,0 +1,82 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Resolve a ``<provider>:<model>`` spec to a framework embedder.
+
+Mirrors flycanon's ``embedding_service._build_embedder``: one branch per
+provider shipped by ``fireflyframework_agentic.embeddings``. Per-provider
+imports are deferred so a spec that never touches a given provider doesn't
+require its SDK to be installed.
+"""
+
+from __future__ import annotations
+
+import os
+
+from fireflyframework_agentic.embeddings.base import BaseEmbedder
+
+
+def build_embedder(spec: str, *, dimensions: int | None = None, batch_size: int = 64) -> BaseEmbedder:
+    """Build a framework embedder from a ``"<provider>:<model>"`` spec.
+
+    The spec is split on its first colon, so a model name that itself contains
+    a colon is passed through intact. The provider is matched
+    case-insensitively, and surrounding whitespace on both halves is ignored.
+
+    Supported providers: openai, azure (alias ``azure-openai``), cohere,
+    google (alias ``gemini``), mistral, voyage, bedrock, ollama.
+
+    Environment variables read here:
+
+    - ``AZURE_OPENAI_ENDPOINT`` -- passed as ``azure_endpoint`` for azure; an
+      empty string is passed when it is unset.
+    - ``OLLAMA_HOST`` -- passed as ``base_url`` for ollama; defaults to
+      ``http://localhost:11434``.
+
+    Args:
+        spec: Embedder spec in ``"<provider>:<model>"`` form.
+        dimensions: Forwarded unchanged to the provider embedder.
+        batch_size: Forwarded unchanged to the provider embedder.
+
+    Returns:
+        The embedder instance for the requested provider.
+
+    Raises:
+        ValueError: If the spec has no colon, names no model, or names an
+            unknown provider.
+    """
+    provider, sep, model = spec.partition(":")
+    model = model.strip()
+    # A spec such as "openai:" has a colon but no model; reject it here rather
+    # than handing an empty model name to the provider embedder.
+    if not sep or not model:
+        raise ValueError(f"embedder spec must be '<provider>:<model>' (got {spec!r})")
+    p = provider.strip().lower()
+    # Provider imports stay inside their branches so an unused provider's SDK
+    # does not have to be installed.
+    if p == "openai":
+        from fireflyframework_agentic.embeddings.providers.openai import OpenAIEmbedder  # noqa: PLC0415
+
+        return OpenAIEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p in ("azure", "azure-openai"):
+        from fireflyframework_agentic.embeddings.providers.azure import AzureEmbedder  # noqa: PLC0415
+
+        return AzureEmbedder(
+            model=model,
+            dimensions=dimensions,
+            batch_size=batch_size,
+            azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""),
+        )
+    if p == "cohere":
+        from fireflyframework_agentic.embeddings.providers.cohere import CohereEmbedder  # noqa: PLC0415
+
+        return CohereEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p in ("google", "gemini"):
+        from fireflyframework_agentic.embeddings.providers.google import GoogleEmbedder  # noqa: PLC0415
+
+        return GoogleEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p == "mistral":
+        from fireflyframework_agentic.embeddings.providers.mistral import MistralEmbedder  # noqa: PLC0415
+
+        return MistralEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p == "voyage":
+        from fireflyframework_agentic.embeddings.providers.voyage import VoyageEmbedder  # noqa: PLC0415
+
+        return VoyageEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p == "bedrock":
+        from fireflyframework_agentic.embeddings.providers.bedrock import BedrockEmbedder  # noqa: PLC0415
+
+        return BedrockEmbedder(model=model, dimensions=dimensions, batch_size=batch_size)
+    if p == "ollama":
+        from fireflyframework_agentic.embeddings.providers.ollama import OllamaEmbedder  # noqa: PLC0415
+
+        base_url = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
+        return OllamaEmbedder(model=model, dimensions=dimensions, base_url=base_url, batch_size=batch_size)
+    raise ValueError(
+        f"unknown embedding provider {provider!r}; supported: "
+        "openai, azure, cohere, google, mistral, voyage, bedrock, ollama"
+    )