Skip to content

Commit d39c68d

Browse files
authored
De-flake conformance CI: solo re-verification, spawn-storm reduction, result artifacts (#3043)
1 parent 0da9092 commit d39c68d

4 files changed

Lines changed: 144 additions & 6 deletions

File tree

Lines changed: 104 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,104 @@
#!/bin/bash
# Client conformance runner with solo re-verification of unexpected failures.
#
# The suite runs its scenarios concurrently; on a 2-vCPU runner that can push
# scenarios with real-time waits past their tolerance. Re-running a failed
# scenario on its own separates the two cases: a real failure fails again,
# a contention artifact passes. Only failures that reproduce solely under
# concurrency are excused.
#
# Environment:
#   CONFORMANCE_PKG            (required) package spec handed to `npx`.
#   CONFORMANCE_SOLO_ATTEMPTS  (optional) solo tries per scenario, default 1.
# Arguments: forwarded verbatim to the harness's `client` subcommand.
set -uo pipefail

: "${CONFORMANCE_PKG:?set CONFORMANCE_PKG (pinned in .github/workflows/conformance.yml)}"

# Default of one attempt: a solo failure on the quiet runner already disproves
# the contention hypothesis, and trying again would be the blind retry this
# script exists to avoid.
SOLO_ATTEMPTS=${CONFORMANCE_SOLO_ATTEMPTS:-1}

# Work from the repo root (three levels above this script) so relative
# arguments resolve there; same contract as run-server.sh.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
if ! cd "$SCRIPT_DIR/../../.."; then
  exit 1
fi

# Capture the suite output for later parsing; the file is removed on any exit.
log=$(mktemp)
trap 'rm -f "$log"' EXIT

# Stream the suite output to the console while keeping a copy in $log.
# PIPESTATUS[0] is the harness's own status, not tee's.
npx --yes "$CONFORMANCE_PKG" client "$@" 2>&1 | tee "$log"
rc="${PIPESTATUS[0]}"

# A green suite needs no re-verification.
if ((rc == 0)); then
  exit 0
fi

# Remove ESC[...m colour sequences so the summary can be matched as plain text.
plain=$(sed -e 's/\x1b\[[0-9;]*m//g' "$log")

# Collect the names listed under the "Unexpected failures" heading, up to the
# first blank line. Should the harness ever reword its summary, nothing
# matches, the list is empty, and the suite's own exit code is returned
# unchanged - never a false green.
mapfile -t scenarios < <(
  sed -n '/^Unexpected failures (not in baseline):$/,/^$/s/^ ✗ //p' <<<"$plain"
)
((${#scenarios[@]} > 0)) || exit "$rc"

# Every extracted entry must look like a scenario name; anything else means the
# parse went wrong, so the suite failure stands.
scenario_name_re='^[A-Za-z0-9/_-]+$'
for scenario in "${scenarios[@]}"; do
  [[ "$scenario" =~ $scenario_name_re ]] && continue
  echo "Extracted unexpected-failure name '${scenario}' does not look like a scenario name; passing the suite failure through." >&2
  exit "$rc"
done

# A stale baseline entry is a configuration error that a solo rerun cannot
# excuse. The check is a builtin match on the captured text (a newline is
# prepended so the first line is tested like any other); no pipeline is
# involved, so pipefail/SIGPIPE cannot interfere with this guard.
if [[ $'\n'"$plain" == *$'\n'"Stale baseline entries"* ]]; then
  echo "Suite also reported stale baseline entries; not retrying." >&2
  exit "$rc"
fi

#######################################
# Derive the argument list for solo re-runs from the suite invocation's args.
#
# Suite-only flags are dropped: --scenario replaces --suite, and solo runs are
# judged directly rather than against the baseline (--expected-failures).
# --output-dir is removed as well; when it carried a value, it is re-added
# with a "-solo" suffix so solo runs write to a separate directory.
# Both "--flag value" and "--flag=value" spellings are recognised.
#
# Globals:   rerun_args (written) - filtered arguments for the solo runs
#            output_dir (written) - the suite's --output-dir value, or ""
# Arguments: the original suite arguments
#######################################
build_rerun_args() {
  local arg
  # Name of the value-taking flag whose value is the next argument
  # ("" when the next argument is an ordinary one).
  local pending=""

  rerun_args=()
  output_dir=""

  for arg in "$@"; do
    if [ -n "$pending" ]; then
      # This argument is the value of the preceding flag: remember it for
      # --output-dir, discard it for the suite-only flags.
      if [ "$pending" = "output-dir" ]; then
        output_dir="$arg"
      fi
      pending=""
      continue
    fi
    case "$arg" in
      --output-dir)
        pending="output-dir"
        ;;
      --suite | --expected-failures)
        pending="dropped"
        ;;
      --output-dir=*)
        output_dir="${arg#--output-dir=}"
        ;;
      --suite=* | --expected-failures=*) ;;
      *)
        rerun_args+=("$arg")
        ;;
    esac
  done

  if [ -n "$output_dir" ]; then
    rerun_args+=(--output-dir "${output_dir}-solo")
  fi
}

build_rerun_args "$@"

# Re-run every unexpected failure on its own and decide the final verdict.
#
# Reads:  scenarios, rerun_args, output_dir, SOLO_ATTEMPTS, rc, CONFORMANCE_PKG
# Exits:  0    - every scenario passed solo (suite failures were contention)
#         1    - some scenario still fails when run alone
#         $rc  - the attempt count is unusable, so nothing can be verified

# An attempt count of 0 or a non-number would run no solo attempt at all, and
# the loop below would then report a "real failure" it never observed. Refuse
# to judge instead and hand back the suite's own exit code.
if ! [[ "$SOLO_ATTEMPTS" =~ ^[0-9]+$ ]] || ((10#$SOLO_ATTEMPTS < 1)); then
  echo "CONFORMANCE_SOLO_ATTEMPTS='${SOLO_ATTEMPTS}' is not a positive integer; passing the suite failure through." >&2
  exit "$rc"
fi
# Force base 10 so a value such as "08" is not parsed as invalid octal.
solo_attempts=$((10#$SOLO_ATTEMPTS))

for scenario in "${scenarios[@]}"; do
  passed=0
  for ((attempt = 1; attempt <= solo_attempts; attempt++)); do
    echo ""
    echo "Re-running '${scenario}' solo (attempt ${attempt}/${SOLO_ATTEMPTS})..."
    if npx --yes "$CONFORMANCE_PKG" client --scenario "$scenario" "${rerun_args[@]}"; then
      passed=1
      break
    fi
  done
  # The first scenario that cannot pass alone settles the verdict.
  if [ "$passed" -ne 1 ]; then
    echo "'${scenario}' still fails when run alone: real failure, not suite contention." >&2
    exit 1
  fi
done

# Leave a marker listing the rescued scenarios in the suite's output directory
# (one name per line) so the rescue is visible after the run.
if [ -n "$output_dir" ]; then
  mkdir -p "$output_dir"
  printf '%s\n' "${scenarios[@]}" > "$output_dir/FLAKE_RESCUED"
fi
echo "All ${#scenarios[@]} unexpected failure(s) passed when re-run solo; the suite failures were parallel-run contention."
exit 0

.github/workflows/conformance.yml

Lines changed: 34 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -64,17 +64,20 @@ jobs:
6464
./.github/actions/conformance/run-server.sh
6565
--suite active
6666
--expected-failures ./.github/actions/conformance/expected-failures.yml
67+
--output-dir conformance-results/server-active
6768
- name: Run server conformance (draft suite)
6869
run: >-
6970
./.github/actions/conformance/run-server.sh
7071
--suite draft
7172
--expected-failures ./.github/actions/conformance/expected-failures.yml
73+
--output-dir conformance-results/server-draft
7274
- name: Run server conformance (2026-07-28 wire, all suite)
7375
run: >-
7476
./.github/actions/conformance/run-server.sh
7577
--suite all
7678
--spec-version 2026-07-28
7779
--expected-failures ./.github/actions/conformance/expected-failures.2026-07-28.yml
80+
--output-dir conformance-results/server-2026-07-28
7881
- name: Run server conformance (all suite, extension scenarios)
7982
# A bare `--suite all` (no --spec-version) selects every scenario
8083
# shipped with the pinned harness — including the extension-tagged
@@ -91,6 +94,15 @@ jobs:
9194
./.github/actions/conformance/run-server.sh
9295
--suite all
9396
--expected-failures ./.github/actions/conformance/expected-failures.yml
97+
--output-dir conformance-results/server-all
98+
- name: Upload conformance results
99+
# The log has only summary counts; per-check data is in checks.json.
100+
if: failure()
101+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
102+
with:
103+
name: server-conformance-results
104+
path: conformance-results/
105+
if-no-files-found: ignore
94106

95107
client-conformance:
96108
runs-on: ubuntu-latest
@@ -118,22 +130,39 @@ jobs:
118130
echo "CONFORMANCE_PKG=file:/tmp/conformance.tgz" >> "$GITHUB_ENV"
119131
;;
120132
esac
121-
- run: uv sync --frozen --all-extras --package mcp
133+
# --compile-bytecode: without it, ~40 concurrently spawned interpreters
134+
# race to byte-compile site-packages during the timing-sensitive window.
135+
- run: uv sync --frozen --all-extras --package mcp --compile-bytecode
136+
- name: Pre-compile bytecode (editable sources)
137+
run: uv run --frozen python -m compileall -q src .github/actions/conformance
122138
- name: Run client conformance (all suite)
123139
# The harness runs all scenarios via unbounded Promise.all; with 40
124140
# scenarios on a 2-core runner the slowest one (sse-retry, which has a
125141
# real-time SSE reconnect wait) needs more than the 30s default budget.
142+
# `.venv/bin/python` (not `uv run`) avoids lockfile re-checks in ~40
143+
# concurrent spawns; run-client.sh re-runs unexpected failures solo.
126144
run: >-
127-
npx --yes "$CONFORMANCE_PKG" client
128-
--command 'uv run --frozen python .github/actions/conformance/client.py'
145+
./.github/actions/conformance/run-client.sh
146+
--command '.venv/bin/python .github/actions/conformance/client.py'
129147
--suite all
130148
--timeout 60000
131149
--expected-failures ./.github/actions/conformance/expected-failures.yml
150+
--output-dir conformance-results/client-all
132151
- name: Run client conformance (2026-07-28 wire, all suite)
133152
run: >-
134-
npx --yes "$CONFORMANCE_PKG" client
135-
--command 'uv run --frozen python .github/actions/conformance/client.py'
153+
./.github/actions/conformance/run-client.sh
154+
--command '.venv/bin/python .github/actions/conformance/client.py'
136155
--suite all
137156
--timeout 60000
138157
--spec-version 2026-07-28
139158
--expected-failures ./.github/actions/conformance/expected-failures.2026-07-28.yml
159+
--output-dir conformance-results/client-2026-07-28
160+
- name: Upload conformance results
161+
# The log has only summary counts; per-check data is in checks.json.
162+
# Also on FLAKE_RESCUED: rescued-flake evidence is otherwise discarded.
163+
if: failure() || hashFiles('conformance-results/**/FLAKE_RESCUED') != ''
164+
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
165+
with:
166+
name: client-conformance-results
167+
path: conformance-results/
168+
if-no-files-found: ignore

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -173,3 +173,6 @@ cython_debug/
173173

174174
# claude code
175175
results/
176+
177+
# conformance CI local runs
178+
conformance-results/

examples/servers/everything-server/mcp_everything_server/server.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -192,10 +192,12 @@ async def test_tool_with_progress(ctx: Context) -> str:
192192
async def test_sampling(prompt: str, ctx: Context) -> str:
193193
"""Tests server-initiated sampling (LLM completion request)"""
194194
try:
195-
# Request sampling from client
195+
# Request sampling from client. Without related_request_id the request goes
196+
# to the standalone GET stream and is silently dropped if it is not open yet.
196197
result = await ctx.session.create_message( # pyright: ignore[reportDeprecated]
197198
messages=[SamplingMessage(role="user", content=TextContent(type="text", text=prompt))],
198199
max_tokens=100,
200+
related_request_id=ctx.request_id,
199201
)
200202

201203
# Since we're not passing tools param, result.content is single content

0 commit comments

Comments
 (0)