Merge pull request #2 from eval-protocol/restore-ci-workflows

benjibc · web-flow · commit bacce91cbe90 · 2025-07-31T23:59:20.000-07:00
Restore CI workflow files
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,187 @@
+# CI for the Python package: lint and type checks, the test jobs, and a
+# combined coverage upload.
+name: Python CI
+
+# Keep a single active run per workflow + ref; a newer run cancels the older one.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# Least privilege for GITHUB_TOKEN: nothing in this workflow writes to the
+# repository through it, so read access to the contents is enough.
+permissions:
+  contents: read
+
+on:
+  # Changes that touch only the ignored paths do not trigger a run.
+  push:
+    branches: [main]
+    paths-ignore:
+      - "docs/**"
+      # NOTE: "*" does not match "/", so this covers root-level Markdown only.
+      - "*.md"
+  pull_request:
+    branches: [main]
+    paths-ignore:
+      - "docs/**"
+      # NOTE: "*" does not match "/", so this covers root-level Markdown only.
+      - "*.md"
+  # Allows starting the workflow manually.
+  workflow_dispatch:
+
+jobs:
+  lint-and-type-check:
+    # Gate job: every test job in this workflow lists it under `needs`.
+    name: Lint & Type Check
+    runs-on: ubuntu-latest
+    steps:
+      # Depth 0 fetches all history for all tags and branches.
+      - uses: actions/checkout@v4
+        with: { fetch-depth: 0 }
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with: { python-version: '3.12' }
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with: { enable-cache: true }
+
+      # Installs exactly what uv.lock pins, including every extra and the
+      # dev dependencies.
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      # Tracks the upstream main branch, so the installed revision can change
+      # between runs.
+      - name: Install tau2 for testing
+        run: >-
+          uv pip install
+          git+https://github.com/sierra-research/tau2-bench.git@main
+
+      # --exit-zero: findings are reported, but the step still exits with 0.
+      - name: Lint with flake8
+        run: >-
+          uv run flake8 eval_protocol tests examples scripts
+          --count --exit-zero --max-complexity=10 --max-line-length=88
+          --statistics
+
+      - name: Type check with mypy
+        run: uv run mypy eval_protocol
+
+  test-core:
+    # Runs the main pytest suite in parallel on each Python version in the
+    # matrix and stores one coverage report per version.
+    name: Core Tests (Python ${{ matrix.python-version }})
+    needs: lint-and-type-check
+    runs-on: ubuntu-latest
+    strategy:
+      # A failure on one Python version does not cancel the other versions.
+      fail-fast: false
+      matrix:
+        python-version:
+          - '3.10'
+          - '3.11'
+          - '3.12'
+
+    steps:
+      # Depth 0 fetches all history for all tags and branches.
+      - uses: actions/checkout@v4
+        with: { fetch-depth: 0 }
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with: { enable-cache: true }
+
+      # Installs exactly what uv.lock pins, including every extra and the
+      # dev dependencies.
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      # Tracks the upstream main branch, so the installed revision can change
+      # between runs.
+      - name: Install tau2 for testing
+        run: >-
+          uv pip install
+          git+https://github.com/sierra-research/tau2-bench.git@main
+
+      # tests/test_batch_evaluation.py is excluded here; the
+      # test-batch-evaluation job runs it on its own, without xdist.
+      - name: Run Core Tests with pytest-xdist
+        env:
+          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
+          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
+          PYTHONWARNINGS: 'ignore::DeprecationWarning,ignore::RuntimeWarning'
+        run: |
+          # Run most tests in parallel, but explicitly ignore tests that manage their own servers
+          uv run pytest \
+            -n auto \
+            --ignore=tests/test_batch_evaluation.py \
+            --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
+
+      # One artifact per matrix entry; the upload-coverage job collects them.
+      - name: Store coverage file
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-core-${{ matrix.python-version }}
+          path: coverage.xml
+          retention-days: 1
+
+  test-batch-evaluation:
+    # Runs tests/test_batch_evaluation.py by itself, serially, on Python 3.12.
+    name: Batch Evaluation Tests
+    needs: lint-and-type-check
+    runs-on: ubuntu-latest
+    steps:
+      # Depth 0 fetches all history for all tags and branches.
+      - uses: actions/checkout@v4
+        with: { fetch-depth: 0 }
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with: { python-version: '3.12' }
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with: { enable-cache: true }
+
+      # Installs exactly what uv.lock pins, including every extra and the
+      # dev dependencies.
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      # Tracks the upstream main branch, so the installed revision can change
+      # between runs.
+      - name: Install tau2 for testing
+        run: >-
+          uv pip install
+          git+https://github.com/sierra-research/tau2-bench.git@main
+
+      - name: Run Batch Evaluation Tests
+        env:
+          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
+          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
+          PYTHONWARNINGS: 'ignore::DeprecationWarning,ignore::RuntimeWarning'
+        run: |
+          # Run only this specific test file, WITHOUT xdist
+          uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10
+
+      # The upload-coverage job collects this artifact.
+      - name: Store coverage file
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-batch-eval
+          path: coverage.xml
+          retention-days: 1
+
+  test-mcp-e2e:
+    # Prepares the same Python 3.12 environment as the other test jobs and
+    # uploads a coverage artifact.
+    # NOTE(review): as written, no step in this job runs tests or produces
+    # coverage.xml, so the upload step has nothing to pick up. Confirm whether
+    # the MCP end-to-end test step was dropped intentionally.
+    name: MCP End-to-End Tests
+    needs: lint-and-type-check
+    runs-on: ubuntu-latest
+    steps:
+      # Depth 0 fetches all history for all tags and branches.
+      - uses: actions/checkout@v4
+        with: { fetch-depth: 0 }
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with: { python-version: '3.12' }
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with: { enable-cache: true }
+
+      # Installs exactly what uv.lock pins, including every extra and the
+      # dev dependencies.
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      # Tracks the upstream main branch, so the installed revision can change
+      # between runs.
+      - name: Install tau2 for testing
+        run: >-
+          uv pip install
+          git+https://github.com/sierra-research/tau2-bench.git@main
+
+      - name: Store coverage file
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-mcp-e2e
+          path: coverage.xml
+          retention-days: 1
+
+  upload-coverage:
+    # Collects the coverage artifacts stored by the test jobs and sends them
+    # to Codecov in a single upload. Runs only after all three test jobs
+    # have succeeded.
+    name: Upload Coverage
+    runs-on: ubuntu-latest
+    needs: [test-core, test-batch-evaluation, test-mcp-e2e]
+    steps:
+      # No artifact name is given, so every artifact of this run is
+      # downloaded, each into its own sub-directory of coverage-artifacts/.
+      - name: Download all coverage artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: coverage-artifacts
+
+      # v5 is the current major of the Codecov action and still accepts the
+      # token, directory, fail_ci_if_error and verbose inputs used here.
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          directory: ./coverage-artifacts/
+          # A failed upload is reported but does not fail the workflow.
+          fail_ci_if_error: false
+          verbose: true
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,70 @@
+# Builds and publishes the package when a version tag is pushed.
+name: Python Package Release
+
+on:
+  push:
+    tags:
+      # Version tags such as v1.2.3 or v1.2.3-alpha.
+      - "v[0-9]+.[0-9]+.[0-9]+*"
+
+jobs:
+  build-and-publish:
+    # Builds the distributions for the pushed version tag, publishes them to
+    # PyPI, and then creates the GitHub release with the built files attached.
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # Needed to create GitHub releases
+      id-token: write # Needed for PyPI trusted publishing
+
+    steps:
+    - uses: actions/checkout@v4
+
+    # Same setup-python major version as the CI workflow.
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build twine
+
+    # Writes the distributions (by default an sdist and a wheel) to ./dist.
+    - name: Build package
+      run: python -m build
+
+    - name: Publish package to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      # with:
+      #   user: __token__
+      #   password: ${{ secrets.PYPI_API_TOKEN }} # Requires a PYPI_API_TOKEN secret in repository
+
+    # If using trusted publishing (recommended), the above `with` block for user/password is not needed.
+    # Ensure PyPI project settings are configured for trusted publishing from this GitHub repository and workflow.
+
+    # One step creates the release and attaches the built files; it replaces
+    # the archived actions/create-release action plus a separate asset upload.
+    # It runs after the PyPI upload so that a failed publish does not leave a
+    # GitHub release behind for a version that was never published.
+    # The action authenticates with the workflow token by default.
+    - name: Create GitHub Release
+      id: create_release
+      uses: softprops/action-gh-release@v2
+      with:
+        tag_name: ${{ github.ref_name }}
+        name: Release ${{ github.ref_name }}
+        body: |
+          Changes in this release:
+          - TODO: Add release notes here or link to CHANGELOG.md
+        draft: false
+        prerelease: ${{ contains(github.ref_name, '-') }} # Mark as prerelease if tag contains '-' (e.g., v1.0.0-alpha)
+        # Attaches everything in ./dist, so the wheel is included as well.
+        files: ./dist/*