braintrustdata · Abhijeet Prasad (AbhiPrasad) · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.github/workflows/enforce-pnpm.yml b/.github/workflows/enforce-pnpm.yml
@@ -3,6 +3,7 @@ name: Enforce pnpm
 on:
   pull_request:
   push:
+    branches: [main]
 
 jobs:
   reject-npm-lockfile:

diff --git a/.github/workflows/eval.yaml b/.github/workflows/eval.yaml
@@ -1,10 +1,15 @@
 name: Run pnpm evals
 
 on:
+  pull_request:
+    # Uncomment to run only when files in the 'evals' directory change
+    # paths:
+    #   - "evals/**"
   push:
+    branches: [main]
     # Uncomment to run only when files in the 'evals' directory change
-    # - paths:
-    #     - "evals/**"
+    # paths:
+    #   - "evals/**"
 
 permissions:
   pull-requests: write
@@ -18,7 +23,7 @@ jobs:
     steps:
       - name: Checkout
         id: checkout
-        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           fetch-depth: 0
 

diff --git a/.github/workflows/js.yaml b/.github/workflows/js.yaml
@@ -11,28 +11,19 @@ jobs:
 
     strategy:
       matrix:
-        # duckdb has an incredibly slow install with 24.x
-        node-version: [20.x, 22.x]
+        node-version: [20.x, 22.x, 24.x]
 
     steps:
-      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
-      - name: Cache node_modules
-        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
-        with:
-          path: |
-            node_modules
-            !node_modules/.cache/turbo
-          key: ${{ matrix.runner }}-${{ matrix.node_version }}-node-${{ env.nodeModulesCacheHash }}
-          restore-keys: |
-            ${{ matrix.runner }}-${{ matrix.node_version }}-node-
-      - name: Use Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@3235b876344d2a9aa001b8d1453c930bba69e610 # v3.9.1
-        with:
-          node-version: ${{ matrix.node-version }}
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - name: Setup pnpm
         uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0
         with:
           version: 10.33.0
+      - name: Use Node.js ${{ matrix.node-version }}
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: pnpm
       - run: pnpm install --frozen-lockfile
       - run: pnpm run test
         env:

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -9,10 +9,22 @@ on:
 jobs:
   lint:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     steps:
-      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
-      - uses: actions/setup-python@3542bca2639a428e1796aaa6a2ffef0c0f575566 # v3.1.4
-      - name: Install pre-commit
-        run: python -m pip install pre-commit
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.12"
+      - name: Cache pre-commit
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          path: ~/.cache/pre-commit
+          key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}
+      - name: Set up uv
+        uses: astral-sh/setup-uv@94527f2e458b27549849d47d273a16bec83a01e9 # v7
+        with:
+          enable-cache: true
+      - name: Install dependencies
+        run: uv sync --extra dev
       - name: Run pre-commit
-        run: pre-commit run --all-files
+        run: uv run pre-commit run --all-files
diff --git a/.github/workflows/publish-js.yaml b/.github/workflows/publish-js.yaml
@@ -37,7 +37,7 @@ jobs:
       commit: ${{ steps.release_metadata.outputs.commit }}
       release_type: ${{ steps.release_metadata.outputs.release_type }}
     steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           fetch-depth: 1
           ref: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
       TARGET_BRANCH: ${{ needs.prepare-release.outputs.branch }}
       RELEASE_COMMIT: ${{ needs.prepare-release.outputs.commit }}
     steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           fetch-depth: 0
           ref: ${{ needs.prepare-release.outputs.branch }}

diff --git a/.github/workflows/publish-py.yaml b/.github/workflows/publish-py.yaml
@@ -37,7 +37,7 @@ jobs:
       commit: ${{ steps.release_metadata.outputs.commit }}
       release_type: ${{ steps.release_metadata.outputs.release_type }}
     steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           fetch-depth: 1
           ref: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
       TARGET_BRANCH: ${{ needs.prepare-release.outputs.branch }}
       RELEASE_COMMIT: ${{ needs.prepare-release.outputs.commit }}
     steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           fetch-depth: 0
           ref: ${{ needs.prepare-release.outputs.branch }}
@@ -143,8 +143,13 @@ jobs:
               raise SystemExit(1)
           PY
 
+      - name: Set up uv
+        uses: astral-sh/setup-uv@94527f2e458b27549849d47d273a16bec83a01e9 # v7
+        with:
+          enable-cache: true
+
       - name: Install build dependencies
-        run: python -m pip install --upgrade pip build twine
+        run: uv sync --extra dev
 
       - name: Prepare prerelease package metadata
         if: ${{ env.RELEASE_TYPE == 'prerelease' }}
@@ -171,13 +176,13 @@ jobs:
           PY
 
       - name: Build package
-        run: python -m build
+        run: uv run --extra dev python -m build
 
       - name: Verify package metadata
-        run: python -m twine check dist/*
+        run: uv run --extra dev python -m twine check dist/*
 
       - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
         with:
           packages-dir: dist/
 

diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
@@ -8,24 +8,32 @@ on:
 
 jobs:
   build:
+    name: Python ${{ matrix.python-version }}
     runs-on: ubuntu-latest
+    timeout-minutes: 20
     strategy:
+      fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
-      - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@3542bca2639a428e1796aaa6a2ffef0c0f575566 # v3.1.4
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Set up uv
+        uses: astral-sh/setup-uv@94527f2e458b27549849d47d273a16bec83a01e9 # v7
+        with:
+          enable-cache: true
       - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip setuptools build twine openai
-          python -m pip install -e .[all]
+        run: uv sync --all-extras
+      - name: Build package
+        run: uv run --all-extras python -m build
+      - name: Check package metadata
+        run: uv run --all-extras python -m twine check dist/*
       - name: Test with pytest
-        run: |
-          pytest
+        run: uv run --all-extras pytest
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}

diff --git a/.github/workflows/version-sync.yaml b/.github/workflows/version-sync.yaml
@@ -10,6 +10,6 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 5
     steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - name: Check JS and Python package versions match
         run: python3 .github/scripts/check_version_sync.py
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,107 @@
+# AGENTS.md
+
+This file provides guidance to coding agents when working with code in this repository. `AGENTS.md` is the source of truth; `CLAUDE.md` is a symlink for compatibility.
+
+## Project Overview
+
+Autoevals is a dual-language library (TypeScript + Python) for evaluating AI model outputs. It provides LLM-as-a-judge evaluations, heuristic scorers (e.g., Levenshtein distance), and statistical metrics (e.g., BLEU). Developed by Braintrust.
+
+## Commands
+
+### TypeScript (from root directory)
+
+```bash
+pnpm install --frozen-lockfile          # Install dependencies
+pnpm run build                          # Build JS (outputs to jsdist/)
+pnpm run test                           # Run all JS tests with vitest
+pnpm run test -- js/llm.test.ts                    # Run single test file
+pnpm run test -- -t "test name"                    # Run specific test by name
+```
+
+### Python (from root directory)
+
+Python dependency management uses `uv` and the project metadata in `pyproject.toml`/`uv.lock`.
+
+```bash
+make develop                         # Set up .venv with dev + scipy extras and install pre-commit
+source env.sh                        # Activate the .venv
+uv sync --extra dev --extra scipy    # Sync local dev dependencies
+uv sync --all-extras                 # Sync all optional dependency groups (CI-style)
+uv run --extra dev --extra scipy pytest                              # Run Python tests
+uv run --extra dev --extra scipy pytest py/autoevals/test_llm.py      # Run single test file
+uv run --extra dev --extra scipy pytest -k "test_name"               # Run tests matching pattern
+uv run --all-extras python -m build                                   # Build Python package
+uv run --all-extras python -m twine check dist/*                      # Check package metadata
+```
+
+### Linting
+
+```bash
+uv run --extra dev pre-commit run --all-files    # Run all linters (black, ruff, prettier, codespell)
+pre-commit run --all-files                       # Also works after make develop/source env.sh
+make fixup                                       # Same as above
+```
+
+## Architecture
+
+### Dual Implementation Pattern
+
+The library maintains parallel implementations in TypeScript (`js/`) and Python (`py/autoevals/`). Both share:
+
+- The same evaluation templates (`templates/*.yaml`)
+- The same `Score` interface: `{name, score (0-1), metadata}`
+- The same scorer names and behavior
+
+### Key Modules (both languages)
+
+- `llm.ts` / `llm.py` - LLM-as-a-judge scorers (Factuality, Battle, ClosedQA, Humor, Security, Sql, Summary, Translation)
+- `ragas.ts` / `ragas.py` - RAG evaluation metrics (ContextRelevancy, Faithfulness, AnswerRelevancy, etc.)
+- `string.ts` / `string.py` - Text similarity (Levenshtein, EmbeddingSimilarity)
+- `json.ts` / `json.py` - JSON validation and diff
+- `oai.ts` / `oai.py` - OpenAI client wrapper with caching
+- `score.ts` / `score.py` - Core Score type and Scorer base class
+
+### Template System
+
+YAML templates in `templates/` define LLM classifier prompts. Templates use Mustache syntax (`{{variable}}`). The `LLMClassifier` class loads these templates and handles:
+
+- Prompt rendering with chain-of-thought (CoT) suffix
+- Tool-based response parsing via `select_choice` function
+- Score mapping from choice letters to numeric scores
+
+### Python Scorer Pattern
+
+```python
+class Scorer(ABC):
+    def eval(self, output, expected=None, **kwargs) -> Score      # Sync
+    async def eval_async(self, output, expected=None, **kwargs)   # Async
+    def __call__(...)  # Alias for eval()
+```
+
+### TypeScript Scorer Pattern
+
+```typescript
+type Scorer<Output, Extra> = (
+  args: ScorerArgs<Output, Extra>,
+) => Score | Promise<Score>;
+// Built-in scorers are async functions; the type also permits returning a Score synchronously
+```
+
+## CI and Releases
+
+- Publishing is handled by trusted publishing workflows documented in `docs/PUBLISHING.md`.
+- JavaScript and Python package versions must stay in sync between `package.json` and `py/autoevals/version.py`; CI enforces this via `.github/workflows/version-sync.yaml` and `.github/scripts/check_version_sync.py`.
+
+## Environment Variables
+
+Tests that call real LLM APIs use the following environment variables:
+
+- `OPENAI_API_KEY` or `BRAINTRUST_API_KEY` - For LLM-based evaluations
+- `OPENAI_BASE_URL` (optional) - Custom API endpoint
+
+## Testing Notes
+
+- Python tests use `pytest` with `respx` for HTTP mocking
+- TypeScript tests use `vitest` with `msw` for HTTP mocking
+- Tests that call real LLM APIs need valid API keys
+- Test files are colocated: `test_*.py` (Python), `*.test.ts` (TypeScript)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ name: Enforce pnpm @@
     on:
       pull_request:
       push:
+        branches: [main]
     jobs:
       reject-npm-lockfile:
@@ Expand Down @@