From 8242762a59ca108ad03f10ac46ae3cd9139acc1c Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 00:35:40 -0700
Subject: [PATCH 01/16] gb300 1k1k sglang

---
 .github/configs/nvidia-master.yaml            | 33 +++++++
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 99 +++++++++++++++++++
 perf-changelog.yaml                           |  9 ++
 runners/launch_gb300-nv.sh                    | 18 +++-
 4 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..b58f27aee 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7666,3 +7666,36 @@ dsv4-fp4-gb200-dynamo-vllm:
         tp: 16
         ep: 16
         dp-attn: true
+
+dsv4-fp4-gb300-dynamo-sglang:
+  image: lmsysorg/sglang:deepseek-v4-grace-blackwell
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb300
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single
+  # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe
+  # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/
+  # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-nv.sh.
+  # DEP/TEP variants are upstream follow-ups; mirror that and ship 1P1D
+  # only here.
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - conc-list: [1, 4, 16, 64, 256]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
new file mode 100644
index 000000000..307298449
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -0,0 +1,99 @@
+name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
+
+# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang +
+# dynamo frontend. Ported from NVIDIA/srt-slurm PR #75
+# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of
+# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree.
+#
+# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
+# (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under
+# moderate-to-high concurrency.
+#
+# Local deltas vs upstream PR #75:
+#   * benchmark.type = sa-bench (upstream uses "manual" because they pair
+#     with a separate sa-bench launcher; our sweep harness drives sa-bench
+#     in-recipe).
+#   * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the
+#     GB200 sglang sibling — same handshake-stability rationale.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x16x64x256"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7ed3c16ff..1807d37d2 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1833,3 +1833,12 @@
     - "Bump --chunked-prefill-size from 4096 to 8192"
     - "Retrigger dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160
+
+- config-keys:
+    - dsv4-fp4-gb300-dynamo-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration"
+    - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell"
+    - "Topology: 1P + 1D, both TP=4 on a single GB300; MXFP4 MoE kernels, NIXL KV transfer"
+    - "Recipe ported from NVIDIA/srt-slurm PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index 5f48ddcec..ba888c10a 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -18,8 +18,15 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export SERVED_MODEL_NAME="deepseek-r1-fp8"
     export MODEL_PATH=/raid/shared/models/deepseek-r1-0528
     export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
+elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSv4
+    # sglang recipes (benchmarks/multi_node/srt-slurm-recipes/sglang/
+    # deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml).
+    export SERVED_MODEL_NAME="deepseek-v4-pro"
+    export MODEL_PATH=/raid/shared/models/deepseek-v4-pro
+    export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
 else
-    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8"
+    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4"
     exit 1
 fi
 
@@ -47,6 +54,15 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout sa-submission-q2-2026
 
+# Overlay our hand-rolled DSv4 sglang recipes on top of the upstream tree.
+# NVIDIA/srt-slurm has no upstream sglang DSv4 disagg recipe for GB300
+# beyond PR #75's 1P1D-TP4 entry, so we ship the recipe locally and copy
+# it in here. Mirrors the equivalent block in launch_gb200-nv.sh.
+if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then
+    mkdir -p recipes/sglang/deepseek-v4
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
+fi
+
 echo "Installing srtctl..."
 export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin"
 curl -LsSf https://astral.sh/uv/install.sh | sh

From ba062c0f94277b89d6a0cea1b4999f7944218103 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 09:54:21 -0700
Subject: [PATCH 02/16] route gb300 sglang to cw cluster

---
 .github/configs/nvidia-master.yaml            |   9 +-
 .github/configs/runners.yaml                  |   5 +
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           |  16 ++
 runners/launch_gb300-cw.sh                    | 267 ++++++++++++++++++
 runners/launch_gb300-nv.sh                    |  18 +-
 5 files changed, 294 insertions(+), 21 deletions(-)
 create mode 100755 runners/launch_gb300-cw.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b58f27aee..338db42b2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7671,7 +7671,7 @@ dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:deepseek-v4-grace-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: gb300
+  runner: gb300-cw
   precision: fp4
   framework: dynamo-sglang
   multinode: true
@@ -7679,9 +7679,10 @@ dsv4-fp4-gb300-dynamo-sglang:
   # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single
   # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe
   # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/
-  # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-nv.sh.
-  # DEP/TEP variants are upstream follow-ups; mirror that and ship 1P1D
-  # only here.
+  # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-cw.sh.
+  # Cluster gb300-cw is CoreWeave (2x 18-node racks); recipe sets its
+  # own sbatch_directives.segment for rack pinning. DEP/TEP variants
+  # are upstream follow-ups; mirror that and ship 1P1D only here.
   seq-len-configs:
   - isl: 1024
     osl: 1024
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 693bb4561..4ce8d2fcb 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -131,3 +131,8 @@ gb300:
 - 'gb300-nv_0'
 - 'gb300-nv_1'
 - 'gb300-nv_2'
+gb300-cw:
+- 'gb300-cw_0'
+- 'gb300-cw_1'
+- 'gb300-cw_2'
+- 'gb300-cw_3'
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 307298449..68e96edeb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -5,6 +5,12 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
 # (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of
 # the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree.
 #
+# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at
+# 1P1D-TP4 fits trivially within a single rack; the explicit segment
+# below pins them so the NIXL KV transfer between prefill and decode
+# stays rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto
+# segment so each recipe owns its own value.)
+#
 # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
 # (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under
 # moderate-to-high concurrency.
@@ -15,6 +21,8 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
 #     in-recipe).
 #   * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the
 #     GB200 sglang sibling — same handshake-stability rationale.
+#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring the
+#     dynamo-vllm gb300 recipe convention.
 
 model:
   path: "deepseek-v4-pro"
@@ -24,6 +32,14 @@ model:
 dynamo:
   version: 0.8.1
 
+# Pin both nodes (1P + 1D) to the same rack on cw. Without this they
+# can land on different racks and pay the cross-rack hop on every NIXL
+# KV transfer.
+sbatch_directives:
+  segment: "2"
+  # Use all node memory; cw default is too tight for the MXFP4 worker.
+  mem: "0"
+
 slurm:
   time_limit: "8:00:00"
 
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
new file mode 100755
index 000000000..582a2a342
--- /dev/null
+++ b/runners/launch_gb300-cw.sh
@@ -0,0 +1,267 @@
+#!/usr/bin/bash
+
+# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw
+# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in
+# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). Compared to
+# that script, the SGLang flow is simpler: no dynamo wheel prebuild and
+# no vllm-container-deps.sh override: dynamo is provisioned per the
+# recipe's `dynamo:` block (srtctl install, or the container's build).
+
+set -x
+
+if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    # Weights staged on the shared VAST mount; no compute-node-local
+    # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in
+    # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/.
+    export MODEL_PATH="/mnt/vast/models/dsv4/"
+    export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+else
+    echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang"
+    exit 1
+fi
+
+# CoreWeave cluster has a single `all` partition; account `cw-sup` is
+# what `sacctmgr show assoc user=$USER` returns there. `benchmark`
+# (inherited from gb200-nv) does not exist on cw.
+export SLURM_PARTITION="all"
+export SLURM_ACCOUNT="cw-sup"
+
+# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env
+# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so)
+# to mount into the container. cw doesn't set them by default — without
+# them the container has no libcuda and CUDA init fails. SLURM's default
+# --export=ALL propagates these from this shell through sbatch+srun
+# into the enroot environment.
+export NVIDIA_VISIBLE_DEVICES=all
+export NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+NGINX_IMAGE="nginx:1.27.4"
+
+# Squash files live alongside models on /mnt/vast (shared across nodes).
+SQUASH_DIR="/mnt/vast/squash"
+mkdir -p "$SQUASH_DIR"
+SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+
+enroot import -o $SQUASH_FILE docker://$IMAGE
+enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+
+export ISL="$ISL"
+export OSL="$OSL"
+
+# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML.
+# Without it, srtctl apply scans every YAML in the repo and submits
+# hundreds of jobs.
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
+echo "Cloning srt-slurm repository..."
+SRT_REPO_DIR="srt-slurm"
+if [ -d "$SRT_REPO_DIR" ]; then
+    echo "Removing existing $SRT_REPO_DIR..."
+    rm -rf "$SRT_REPO_DIR"
+fi
+
+git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+cd "$SRT_REPO_DIR"
+git checkout sa-submission-q2-2026
+
+# Overlay our hand-rolled DSv4 SGLang recipes. NVIDIA/srt-slurm has no
+# upstream sglang DSv4 disagg recipe yet beyond PR #75's 1P1D-TP4
+# entry, so we ship the recipe locally and copy it in here. `cp -rT`
+# overlays onto a possibly-existing upstream stub instead of nesting.
+mkdir -p recipes/sglang/deepseek-v4
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
+
+echo "Installing srtctl..."
+# CRITICAL — uv install location.
+# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
+# shared NFS across both. srtctl's slurm template (job_script_minimal.j2)
+# does `if ! command -v uv` and skips its own ARM64 install when uv is
+# already on PATH; on compute nodes $HOME/.local/bin is on PATH by
+# default, so a stray x86 binary at $HOME/.local/bin/uv from this
+# runner shadows the template's install and crashes the orchestrator
+# with `cannot execute binary file: Exec format error`. Install to a
+# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86
+# uv left in the shared path by prior runs.
+rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx"
+export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin"
+mkdir -p "$XDG_BIN_HOME"
+curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh
+export PATH="$XDG_BIN_HOME:$PATH"
+
+if [ ! -x "$XDG_BIN_HOME/uv" ]; then
+    echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2
+    exit 1
+fi
+if [ -e "$HOME/.local/bin/uv" ]; then
+    echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2
+    exit 1
+fi
+
+uv venv
+source .venv/bin/activate
+uv pip install -e .
+
+if ! command -v srtctl &> /dev/null; then
+    echo "Error: Failed to install srtctl"
+    exit 1
+fi
+
+echo "Configs available at: $SRT_REPO_DIR/"
+
+SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+echo "Creating srtslurm.yaml configuration..."
+cat > srtslurm.yaml <<EOF
+# SRT SLURM Configuration for GB300-CW (SGLang)
+
+default_account: "${SLURM_ACCOUNT}"
+default_partition: "${SLURM_PARTITION}"
+default_time_limit: "6:00:00"
+
+gpus_per_node: 4
+network_interface: ""
+
+srtctl_root: "${SRTCTL_ROOT}"
+
+model_paths:
+  "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}"
+containers:
+  dynamo-trtllm: ${SQUASH_FILE}
+  dynamo-sglang: ${SQUASH_FILE}
+  "${IMAGE}": ${SQUASH_FILE}
+  nginx-sqsh: ${NGINX_SQUASH_FILE}
+# Auto-emission of #SBATCH --segment={total_nodes} is turned off here
+# because each gb300 recipe sets its own segment via sbatch_directives
+# (rack-pinning on cw's 2x18-node racks).
+use_segment_sbatch_directive: false
+EOF
+
+echo "Generated srtslurm.yaml:"
+cat srtslurm.yaml
+
+echo "Running make setup..."
+make setup ARCH=aarch64
+
+# Export eval-related env vars for srt-slurm post-benchmark eval
+export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
+echo "Submitting job with srtctl..."
+
+# Override the job name in the config file with the runner name
+sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
+
+SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+echo "$SRTCTL_OUTPUT"
+# Prefer the "✅ Job N" line, else any "Job N"; keep only the first match so JOB_ID is a single id.
+JOB_ID=$( { echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+'; } | head -n1)
+
+set +x
+
+if [ -z "$JOB_ID" ]; then
+    echo "Error: Failed to extract JOB_ID from srtctl output"
+    exit 1
+fi
+
+echo "Extracted JOB_ID: $JOB_ID"
+
+LOGS_DIR="outputs/$JOB_ID/logs"
+LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
+
+while ! ls "$LOG_FILE" &>/dev/null; do
+    if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+        echo "ERROR: Job $JOB_ID failed before creating log file"
+        scontrol show job "$JOB_ID"
+        exit 1
+    fi
+    echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
+    sleep 5
+done
+
+(
+    while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+        sleep 10
+    done
+) &
+POLL_PID=$!
+
+echo "Tailing LOG_FILE: $LOG_FILE"
+
+tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+wait $POLL_PID
+
+set -x
+
+echo "Job $JOB_ID completed!"
+echo "Collecting results..."
+
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
+    echo "Warning: Logs directory not found at $LOGS_DIR"
+fi
+
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
+
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    filename=$(basename "$result_file")
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
+
+    echo "All result files processed"
+else
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
+
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"  # eval artifacts are looked up under the job's logs dir
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        shopt -s nullglob  # empty dir => the glob expands to nothing instead of a literal '*'
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue  # copy regular files only
+            cp "$eval_file" "$GITHUB_WORKSPACE/"
+            echo "Copied eval artifact: $(basename "$eval_file")"
+        done
+        shopt -u nullglob
+    else
+        echo "WARNING: eval requested (RUN_EVAL=${RUN_EVAL:-false}, EVAL_ONLY=${EVAL_ONLY:-false}) but no eval results found at $EVAL_DIR"
+    fi
+fi
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index ba888c10a..5f48ddcec 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -18,15 +18,8 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export SERVED_MODEL_NAME="deepseek-r1-fp8"
     export MODEL_PATH=/raid/shared/models/deepseek-r1-0528
     export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
-elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-    # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSv4
-    # sglang recipes (benchmarks/multi_node/srt-slurm-recipes/sglang/
-    # deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml).
-    export SERVED_MODEL_NAME="deepseek-v4-pro"
-    export MODEL_PATH=/raid/shared/models/deepseek-v4-pro
-    export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
 else
-    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4"
+    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8"
     exit 1
 fi
 
@@ -54,15 +47,6 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout sa-submission-q2-2026
 
-# Overlay our hand-rolled DSv4 sglang recipes on top of the upstream tree.
-# NVIDIA/srt-slurm has no upstream sglang DSv4 disagg recipe for GB300
-# beyond PR #75's 1P1D-TP4 entry, so we ship the recipe locally and copy
-# it in here. Mirrors the equivalent block in launch_gb200-nv.sh.
-if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then
-    mkdir -p recipes/sglang/deepseek-v4
-    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
-fi
-
 echo "Installing srtctl..."
 export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin"
 curl -LsSf https://astral.sh/uv/install.sh | sh

From 79039709ac4d8964e2fe49b476f67218f4f4aa37 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 10:21:04 -0700
Subject: [PATCH 03/16] connector

---
 .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 68e96edeb..c563ef45a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -63,7 +63,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"

From 26943f799e74e29f2ec1959b005e5072afaf4087 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 10:56:23 -0700
Subject: [PATCH 04/16] path

---
 runners/launch_gb300-cw.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 582a2a342..82cb8d35e 100755
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -43,6 +43,14 @@ mkdir -p "$SQUASH_DIR"
 SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
+# Some images were imported with '+' separators (enroot's default) rather
+# than '_'. Check for the '+' variant and symlink so both names resolve.
+SQUASH_FILE_PLUS="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh"
+if [ ! -f "$SQUASH_FILE" ] && [ -f "$SQUASH_FILE_PLUS" ]; then
+    ln -sf "$SQUASH_FILE_PLUS" "$SQUASH_FILE"
+    echo "[squash] symlinked $SQUASH_FILE -> $SQUASH_FILE_PLUS"
+fi
+
 enroot import -o $SQUASH_FILE docker://$IMAGE
 enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
 

From e7b58f72bd4f4f5e37f4b18fbb47ec1f35120bf5 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 13:41:20 -0700
Subject: [PATCH 05/16] =?UTF-8?q?drop=20forced=20dynamo=200.8.1=20install?=
 =?UTF-8?q?=20=E2=80=94=20use=20container-bundled=20dynamo=20for=20DSv4=20?=
 =?UTF-8?q?formatter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml      | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index c563ef45a..d268ec7b6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -29,8 +29,16 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# NB: no `dynamo:` block. Mirroring upstream PR #75: rely on whatever
+# dynamo the lmsysorg/sglang:deepseek-v4-grace-blackwell container ships.
+# That container's bundled dynamo has the native DeepSeekV4Formatter (added
+# at hash 6a159fed, 2026-04-23 — see comment in the gb200 vllm sibling
+# disagg-gb200-7p1d-dep8-dep16.yaml) which auto-detects DSv4 by model name
+# and serves /v1/completions without needing chat_template in
+# tokenizer_config.json. Forcing dynamo.version=0.8.1 made srtctl pip-install
+# an older release on top of the container that *did* require chat_template,
+# and the frontend then 404'd: PromptFormatter.from_mdc rejected the model
+# at pipeline build time. Run #24963242956 was the casualty.
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
 # can land on different racks and pay the cross-rack hop on every NIXL

From fa52ab060f35edb2aef3aa8783127350f7c830d0 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 14:44:01 -0700
Subject: [PATCH 06/16] match upstream PR #75 tunings + skip srtctl dynamo
 install

---
 .github/configs/nvidia-master.yaml            |  2 +-
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 88 ++++++++++++-------
 2 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 56c23b5d4..6778a25d3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7710,7 +7710,7 @@ dsv4-fp4-gb300-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - conc-list: [1, 4, 16, 64, 256]
+    - conc-list: [4, 8, 16, 32, 64, 128]
       prefill:
         num-worker: 1
         tp: 4
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index d268ec7b6..516a14169 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -12,33 +12,53 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
 # segment so each recipe owns its own value.)
 #
 # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
-# (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under
-# moderate-to-high concurrency.
+# (4 GPUs / node). KV transfer over NIXL. PR #75 measures saturation
+# at conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
+#
+# ⚠️  NIXL state-buffer-transfer accuracy bug (upstream PR #75 body):
+# the SGLang NIXL backend currently registers and transfers the KV cache
+# correctly but DROPS the model's auxiliary state buffer (SWA / NSA /
+# Mamba). On DSv4-Pro this collapses GSM8K from 1.000 (agg) to ~0.13
+# (disagg) while throughput numbers and KV byte hashes look healthy.
+# Mooncake handles state buffers correctly; the NIXL fix mirrors that
+# (~237 lines extending KVArgsRegisterInfo, TransferInfo and
+# register_buffer_to_engine, plus a new send_state, in
+# python/sglang/srt/disaggregation/nixl/conn.py). Until the upstream
+# sglang fix lands, the patch must be picked up via the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell container build. If
+# eval-only GSM8K runs come back near 0.13 with healthy throughput,
+# that's the cause — not a tuning issue.
 #
 # Local deltas vs upstream PR #75:
-#   * benchmark.type = sa-bench (upstream uses "manual" because they pair
-#     with a separate sa-bench launcher; our sweep harness drives sa-bench
-#     in-recipe).
-#   * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the
-#     GB200 sglang sibling — same handshake-stability rationale.
-#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring the
-#     dynamo-vllm gb300 recipe convention.
+#   * benchmark.type = sa-bench: no longer a delta — upstream's latest
+#     revision also uses sa-bench.
+#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring
+#     the dynamo-vllm gb300 recipe convention. Upstream targets a
+#     different cluster and doesn't need this.
 
 model:
   path: "deepseek-v4-pro"
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# NB: no `dynamo:` block. Mirroring upstream PR #75: rely on whatever
-# dynamo the lmsysorg/sglang:deepseek-v4-grace-blackwell container ships.
-# That container's bundled dynamo has the native DeepSeekV4Formatter (added
-# at hash 6a159fed, 2026-04-23 — see comment in the gb200 vllm sibling
-# disagg-gb200-7p1d-dep8-dep16.yaml) which auto-detects DSv4 by model name
-# and serves /v1/completions without needing chat_template in
-# tokenizer_config.json. Forcing dynamo.version=0.8.1 made srtctl pip-install
-# an older release on top of the container that *did* require chat_template,
-# and the frontend then 404'd: PromptFormatter.from_mdc rejected the model
-# at pipeline build time. Run #24963242956 was the casualty.
+# Use the container-bundled dynamo. lmsysorg/sglang:deepseek-v4-grace-blackwell
+# ships a dynamo build with the native Rust DeepSeekV4Formatter (added at
+# hash 6a159fed, 2026-04-23 — see comment in the gb200 vllm sibling
+# disagg-gb200-7p1d-dep8-dep16.yaml). That formatter auto-detects DSv4 by
+# model name and serves /v1/completions without needing chat_template in
+# tokenizer_config.json.
+#
+# `install: false` is critical here — without it, srtctl's schema
+# default (install: True, version: "0.8.0", see srt-slurm/src/srtctl/
+# core/schema.py:697) pip-installs ai-dynamo==0.8.0 from PyPI on top
+# of the container, which predates the DSv4 formatter. The frontend
+# then 404s on every request: PromptFormatter.from_mdc rejects the
+# model at pipeline build time with "chat_template field is required
+# in the tokenizer_config.json file". Casualties: runs #24963242956
+# (had dynamo.version: 0.8.1 explicit) and the follow-up (no dynamo
+# block, fell through to the 0.8.0 default).
+dynamo:
+  install: false
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
 # can land on different racks and pay the cross-rack hop on every NIXL
@@ -73,22 +93,10 @@ backend:
   type: sglang
 
   prefill_environment:
-    PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
 
   decode_environment:
-    PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
 
   sglang_config:
     prefill:
@@ -99,8 +107,12 @@ backend:
       disaggregation-mode: "prefill"
       disaggregation-transfer-backend: nixl
       moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
@@ -110,13 +122,21 @@ backend:
       disaggregation-mode: "decode"
       disaggregation-transfer-backend: nixl
       moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "1x4x16x64x256"
+  random_range_ratio: 0.8
+  # Low-latency band only — TP4 1P1D saturates near conc=128 on GB300
+  # (PR #75 verified: 838 Total TPS/GPU at conc=128). For high-conc
+  # Pareto use the DEP variants (not in this PR).
+  concurrencies: "4x8x16x32x64x128"
   req_rate: "inf"
   use_chat_template: false

From bc80a16b773522746d8621c662f1bdb9d6ef8f04 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sun, 26 Apr 2026 15:05:51 -0700
Subject: [PATCH 07/16] add flags

---
 .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 516a14169..934ee1d75 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -94,9 +94,19 @@ backend:
 
   prefill_environment:
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
 
   decode_environment:
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
 
   sglang_config:
     prefill:

From 7f431858a2601fff0648a4f3b2a6a4679356290e Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sun, 26 Apr 2026 15:26:24 -0700
Subject: [PATCH 08/16] add more selection space

---
 .github/configs/nvidia-master.yaml            |  38 ++++-
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml      | 141 ++++++++++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           |   9 +-
 3 files changed, 176 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6778a25d3..3e62175d5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7699,18 +7699,27 @@ dsv4-fp4-gb300-dynamo-sglang:
   framework: dynamo-sglang
   multinode: true
   disagg: true
-  # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single
-  # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe
-  # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/
-  # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-cw.sh.
-  # Cluster gb300-cw is CoreWeave (2x 18-node racks); recipe sets its
-  # own sbatch_directives.segment for rack pinning. DEP/TEP variants
-  # are upstream follow-ups; mirror that and ship 1P1D only here.
+  # 1P + 1D, one GB300 node each (4 GPUs / node), MXFP4 MoE kernels, NIXL
+  # KV transfer. Recipes staged at benchmarks/multi_node/srt-slurm-
+  # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm
+  # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave
+  # (2x 18-node racks); recipes set their own sbatch_directives.segment
+  # for rack pinning.
+  #
+  # Two search-space bands:
+  #   * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give
+  #     single-user latency floor; 4-128 covers the saturation curve
+  #     mirroring NVIDIA/srt-slurm PR #75.
+  #   * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn +
+  #     DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head
+  #     comparison (find the crossover where DPA beats TP-only); 256-
+  #     1024 extends past the symmetric saturation point (~conc=128 /
+  #     838 Total TPS/GPU per PR #75).
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - conc-list: [4, 8, 16, 32, 64, 128]
+    - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
       prefill:
         num-worker: 1
         tp: 4
@@ -7723,3 +7732,16 @@ dsv4-fp4-gb300-dynamo-sglang:
         tp: 4
         ep: 1
         dp-attn: false
+    - conc-list: [16, 32, 64, 128, 256, 512, 1024]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
new file mode 100644
index 000000000..c79cebc4c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -0,0 +1,141 @@
+name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"
+
+# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology:
+#   Prefill: 1 node, TP=4 (no DP-attn, no EP).
+#   Decode:  1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit).
+# Each side runs on its own GB300 node (4 GPUs / node). KV transfer over NIXL.
+#
+# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream
+# PR #75 exactly; this DEP4 variant is a local extension to probe whether
+# decode-side DP-attn + DeepEP unlocks throughput past the symmetric
+# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75).
+#
+# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks
+# see DP=4 replication. SGLang's --disaggregation-decode-tp and
+# --disaggregation-decode-dp flags on the prefill engine carry this
+# metadata so KV chunks route to the correct decode rank during NIXL
+# transfer (server_args.py:643-654, validate_disagg_tp_size).
+#
+# Same NIXL state-buffer-transfer caveat as the TP4 sibling - see
+# disagg-gb300-1p1d-tp4.yaml header. The grace-blackwell image build
+# carries the patch.
+#
+# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same
+# way as the symmetric sibling.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# Use the container-bundled dynamo (skip srtctl pip install). Same
+# rationale as the TP4 sibling - see its header for the casualty list.
+dynamo:
+  install: false
+
+sbatch_directives:
+  segment: "2"
+  mem: "0"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+
+  decode_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn +
+    # EP is enabled. Mirror gen_launch.py medium/large defaults.
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+      # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so
+      # KV chunks route to the correct decode rank during NIXL transfer.
+      disaggregation-decode-tp: 4
+      disaggregation-decode-dp: 4
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      # Full DP-attn on 4 GPUs: each rank is its own DP unit for
+      # attention; MoE is sharded across EP (ep_size = tp_size = 4
+      # implicit when --moe-a2a-backend deepep).
+      enable-dp-attention: true
+      dp-size: 4
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison
+  # (where does decode-side DPA start beating TP-only?); 256-1024
+  # probes throughput past the symmetric saturation point.
+  concurrencies: "16x32x64x128x256x512x1024"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 934ee1d75..86319edc0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -144,9 +144,10 @@ benchmark:
   isl: 1024
   osl: 1024
   random_range_ratio: 0.8
-  # Low-latency band only — TP4 1P1D saturates near conc=128 on GB300
-  # (PR #75 verified: 838 Total TPS/GPU at conc=128). For high-conc
-  # Pareto use the DEP variants (not in this PR).
-  concurrencies: "4x8x16x32x64x128"
+  # Low-latency band — TP4 1P1D saturates near conc=128 on GB300
+  # (PR #75 verified: 838 Total TPS/GPU at conc=128). Conc 1/2 give
+  # single-user latency floor reference; 4-128 covers the saturation
+  # curve. For high-conc Pareto use the DEP variants.
+  concurrencies: "1x2x4x8x16x32x64x128"
   req_rate: "inf"
   use_chat_template: false

From afca046a207c10117b732e1856e330e802fbcfec Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 17:20:30 -0700
Subject: [PATCH 09/16] use _arm64 image tag + squash_dupe dir for gb300-cw

---
 .github/configs/nvidia-master.yaml                  |  6 +++++-
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml            |  2 +-
 .../deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml     |  2 +-
 perf-changelog.yaml                                 |  2 +-
 runners/launch_gb300-cw.sh                          | 13 ++++---------
 5 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3e62175d5..2f0e63f53 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7691,7 +7691,11 @@ dsv4-fp4-gb200-dynamo-vllm:
         dp-attn: true
 
 dsv4-fp4-gb300-dynamo-sglang:
-  image: lmsysorg/sglang:deepseek-v4-grace-blackwell
+  # _arm64 variant: GH runner pod doing `enroot import` is amd64, but
+  # gb300-cw compute nodes are aarch64 (Grace). Without the explicit
+  # arm64 tag the registry serves the amd64 manifest, which fails to
+  # exec on the compute side.
+  image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-cw
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
index c79cebc4c..1b95cd936 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -25,7 +25,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
 # Use the container-bundled dynamo (skip srtctl pip install). Same
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 86319edc0..c35fe4ec0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -38,7 +38,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
 # Use the container-bundled dynamo. The lmsysorg/sglang:deepseek-v4-
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index fb50c6f28..b3855391b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1891,7 +1891,7 @@
     - dsv4-fp4-gb300-dynamo-sglang
   description:
     - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration"
-    - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell"
+    - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)"
     - "Topology: 1P + 1D, both TP=4 on a single GB300; MXFP4 MoE kernels, NIXL KV transfer"
     - "Recipe ported from NVIDIA/srt-slurm PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 82cb8d35e..cfa9ac6f1 100755
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -38,19 +38,14 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility
 NGINX_IMAGE="nginx:1.27.4"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
-SQUASH_DIR="/mnt/vast/squash"
+# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl /
+# pyxis rejects '+' in image paths with "Invalid image format", and the
+# old /mnt/vast/squash dir contains '+'-separated files from prior runs.
+SQUASH_DIR="/mnt/vast/squash_dupe"
 mkdir -p "$SQUASH_DIR"
 SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
-# Some images were imported with '+' separators (enroot's default) rather
-# than '_'. Check for the '+' variant and symlink so both names resolve.
-SQUASH_FILE_PLUS="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh"
-if [ ! -f "$SQUASH_FILE" ] && [ -f "$SQUASH_FILE_PLUS" ]; then
-    ln -sf "$SQUASH_FILE_PLUS" "$SQUASH_FILE"
-    echo "[squash] symlinked $SQUASH_FILE -> $SQUASH_FILE_PLUS"
-fi
-
 enroot import -o $SQUASH_FILE docker://$IMAGE
 enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
 

From 3882a553fe7f19ffc7f4c9d39c122065383b29f8 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 17:32:14 -0700
Subject: [PATCH 10/16] =?UTF-8?q?pin=20dynamo=20to=201.2.0.dev20260426=20?=
 =?UTF-8?q?=E2=80=94=20first=20arm64=20wheel=20with=20DSv4=20formatter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml      | 10 ++++--
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 35 ++++++++++---------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
index 1b95cd936..d94fd569b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -28,10 +28,14 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
-# Use the container-bundled dynamo (skip srtctl pip install). Same
-# rationale as the TP4 sibling - see its header for the casualty list.
+# Pin a dynamo dev wheel containing the DSv4 formatter (hash 6a159fed,
+# 2026-04-23). See the TP4 sibling header for the full rationale and
+# casualty list — the lmsysorg sglang arm64 container ships no
+# ai-dynamo, so install: false dies with ModuleNotFoundError, and any
+# stable version <=1.0.2 lacks the formatter and 404s.
 dynamo:
-  install: false
+  install: true
+  version: "1.2.0.dev20260426"
 
 sbatch_directives:
   segment: "2"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index c35fe4ec0..aa00c9f8a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -41,24 +41,27 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
-# Use the container-bundled dynamo. The lmsysorg/sglang:deepseek-v4-
-# grace-blackwell image ships a dynamo build with the native Rust
-# DeepSeekV4Formatter (added at hash 6a159fed, 2026-04-23 — see comment
-# in the gb200 vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). That
-# formatter auto-detects DSv4 by model name and serves /v1/completions
-# without needing chat_template in tokenizer_config.json.
+# Pin a dynamo dev wheel that contains the native Rust DeepSeekV4Formatter
+# (added at hash 6a159fed on 2026-04-23 — see the comment in the gb200
+# vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). The 2026-04-26 dev wheel
+# from pypi.nvidia.com is the first wheel post that hash with both
+# ai-dynamo and ai-dynamo-runtime aarch64 builds. Without the formatter,
+# the dynamo frontend rejects DSv4 at pipeline build time with
+# "chat_template field is required in the tokenizer_config.json file"
+# and 404s every request — that's what runs #24963242956 and the
+# follow-up hit on stable 0.8.x.
 #
-# `install: false` is critical here — without it, srtctl's schema
-# default (install: True, version: "0.8.0", see srt-slurm/src/srtctl/
-# core/schema.py:697) pip-installs ai-dynamo==0.8.0 from PyPI on top
-# of the container, which predates the DSv4 formatter. The frontend
-# then 404s on every request: PromptFormatter.from_mdc rejects the
-# model at pipeline build time with "chat_template field is required
-# in the tokenizer_config.json file". Casualties: runs #24963242956
-# (had dynamo.version: 0.8.1 explicit) and the follow-up (no dynamo
-# block, fell through to the 0.8.0 default).
+# The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container does
+# NOT bundle ai-dynamo, so install: false is wrong here (gives
+# "ModuleNotFoundError: No module named 'dynamo'" the moment srtctl
+# tries to launch python3 -m dynamo.sglang). The gb200 vllm sibling
+# solves the same gap with hash + install: true + a setup_script that
+# pulls a prebuilt wheel from /mnt/vast/dynamo_cache; we don't have that
+# cache yet for SGLang, so we just let srtctl pip-install the dev wheel
+# per rank from pypi.nvidia.com — same payload, slower per-rank install.
 dynamo:
-  install: false
+  install: true
+  version: "1.2.0.dev20260426"
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
 # can land on different racks and pay the cross-rack hop on every NIXL

From 77bbcb8a4f552e4ceed15c1b62244e5ccdddd6e9 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 18:10:21 -0700
Subject: [PATCH 11/16] =?UTF-8?q?step=20back=20to=20dynamo=20dev20260425?=
 =?UTF-8?q?=20=E2=80=94=20earlier=20wheel=20may=20align=20with=20container?=
 =?UTF-8?q?'s=20bundled=20sglang?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml     | 2 +-
 .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
index d94fd569b..6e82557ae 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -35,7 +35,7 @@ model:
 # stable version <=1.0.2 lacks the formatter and 404s.
 dynamo:
   install: true
-  version: "1.2.0.dev20260426"
+  version: "1.2.0.dev20260425"
 
 sbatch_directives:
   segment: "2"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index aa00c9f8a..68ea73080 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -61,7 +61,7 @@ model:
 # per rank from pypi.nvidia.com — same payload, slower per-rank install.
 dynamo:
   install: true
-  version: "1.2.0.dev20260426"
+  version: "1.2.0.dev20260425"
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
 # can land on different racks and pay the cross-rack hop on every NIXL

From d7dc646431b2e8138021985ddfefd9c1f4c3c3b8 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 18:46:45 -0700
Subject: [PATCH 12/16] =?UTF-8?q?prebuild=20dynamo=20wheel=20from=20hash?=
 =?UTF-8?q?=206a159fed=20on=20/mnt/vast=20=E2=80=94=20mirror=20PR=20#1150?=
 =?UTF-8?q?=20vllm=20pattern=20for=20sglang?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml      | 20 +++--
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 42 ++++-----
 runners/gb300-cw-sglang-container-deps.sh     | 44 ++++++++++
 runners/launch_gb300-cw.sh                    | 86 +++++++++++++++++++
 4 files changed, 165 insertions(+), 27 deletions(-)
 create mode 100755 runners/gb300-cw-sglang-container-deps.sh

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
index 6e82557ae..4f514d394 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -28,14 +28,20 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
-# Pin a dynamo dev wheel containing the DSv4 formatter (hash 6a159fed,
-# 2026-04-23). See the TP4 sibling header for the full rationale and
-# casualty list — the lmsysorg sglang arm64 container ships no
-# ai-dynamo, so install: false dies with ModuleNotFoundError, and any
-# stable version <=1.0.2 lacks the formatter and 404s.
+# Build dynamo from hash 6a159fed via prebuild cache. See the TP4
+# sibling header for the full rationale and the casualty timeline —
+# short version: arm64 container ships no ai-dynamo, dev wheels API-
+# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror
+# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall
+# from /mnt/vast/dynamo_cache/<hash> per rank.
 dynamo:
-  install: true
-  version: "1.2.0.dev20260425"
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: false
+
+setup_script: gb300-cw-sglang-container-deps.sh
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"
 
 sbatch_directives:
   segment: "2"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 68ea73080..86b262cfc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -41,27 +41,29 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
   precision: "fp4"
 
-# Pin a dynamo dev wheel that contains the native Rust DeepSeekV4Formatter
-# (added at hash 6a159fed on 2026-04-23 — see the comment in the gb200
-# vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). The 2026-04-26 dev wheel
-# from pypi.nvidia.com is the first wheel post that hash with both
-# ai-dynamo and ai-dynamo-runtime aarch64 builds. Without the formatter,
-# the dynamo frontend rejects DSv4 at pipeline build time with
-# "chat_template field is required in the tokenizer_config.json file"
-# and 404s every request — that's what runs #24963242956 and the
-# follow-up hit on stable 0.8.x.
-#
-# The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container does
-# NOT bundle ai-dynamo, so install: false is wrong here (gives
-# "ModuleNotFoundError: No module named 'dynamo'" the moment srtctl
-# tries to launch python3 -m dynamo.sglang). The gb200 vllm sibling
-# solves the same gap with hash + install: true + a setup_script that
-# pulls a prebuilt wheel from /mnt/vast/dynamo_cache; we don't have that
-# cache yet for SGLang, so we just let srtctl pip-install the dev wheel
-# per rank from pypi.nvidia.com — same payload, slower per-rank install.
+# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling
+# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace-
+# blackwell_arm64 image lacks both a working ai-dynamo and the rust
+# toolchain for an in-container build; pinning a published dev wheel
+# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat
+# shim warns then disagg startup warmup hangs). Same prebuild-cache
+# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel
+# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/<hash>,
+# and the setup_script below force-reinstalls from cache per rank
+# (~30 s, no per-rank rust build, no API drift).
 dynamo:
-  install: true
-  version: "1.2.0.dev20260425"
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  # install: false → srtctl skips its own pip install; setup_script is
+  # the sole installer.
+  install: false
+
+setup_script: gb300-cw-sglang-container-deps.sh
+
+# Mount /mnt/vast/dynamo_cache into every worker container so each
+# rank can pip-install from the wheel that launch_gb300-cw.sh
+# pre-built there.
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
 # can land on different racks and pay the cross-rack hop on every NIXL
diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh
new file mode 100755
index 000000000..e25362cd5
--- /dev/null
+++ b/runners/gb300-cw-sglang-container-deps.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Custom container-deps installer for gb300-cw + sglang. pip-installs
+# dynamo from a wheel + source archive that launch_gb300-cw.sh pre-built
+# on /mnt/vast BEFORE submitting sbatch.
+#
+# Why the prebuild design (mirrors the vllm sibling at
+# gb300-cw-vllm-container-deps.sh from PR #1150):
+#   srt-slurm's per-rank install path runs `maturin build` inside every
+#   container srtctl launches via srun. The lmsysorg/sglang:deepseek-v4-grace-
+#   blackwell_arm64 image lacks rust pre-installed, so the per-rank
+#   build path can't run; pinning a published dev wheel (1.2.0.dev*)
+#   trips API drift against the bundled sglang 0.5.9 (compat shim
+#   warning + disagg startup warmup hang — see runs ending 2026-04-27).
+#   Building dynamo ONCE from hash 6a159fed (the same commit the gb200
+#   vllm recipe pins, known to be sglang-API-stable) on a single-node
+#   srun in launch_gb300-cw.sh sidesteps both: every rank pip-installs
+#   from the cache here (~30 s, no contention).
+#
+#   Used in tandem with `dynamo.install: false` in the gb300-cw sglang
+#   recipes so srt-slurm's hardcoded install path is skipped and this
+#   script is the sole installer.
+
+set -e
+
+DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}"
+CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH"
+DONE_MARKER="$CACHE_DIR/.done"
+
+if [ ! -f "$DONE_MARKER" ]; then
+    echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2
+    echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2
+    exit 1
+fi
+
+echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR"
+pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall
+
+rm -rf /tmp/dynamo_build
+mkdir -p /tmp/dynamo_build/dynamo
+tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo
+cd /tmp/dynamo_build/dynamo
+pip install --break-system-packages -e .
+
+echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)"
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index cfa9ac6f1..b03dc6dd9 100755
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -49,6 +49,84 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh
 enroot import -o $SQUASH_FILE docker://$IMAGE
 enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
 
+# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting
+# the main sbatch. The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
+# image lacks a working ai-dynamo (install: false → ModuleNotFoundError),
+# and pinning a published dev wheel (1.2.0.dev*) trips API drift against
+# the bundled sglang 0.5.9 (compat shim warns then disagg startup warmup
+# hangs — see runs ending 2026-04-27). Building from hash 6a159fed (the
+# same commit the gb200 vllm sibling pins, known sglang-API-stable) on
+# a single dedicated srun eliminates per-rank coordination on /mnt/vast
+# (NFS flock is unreliable). Same pattern as PR #1150's vllm launcher.
+DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b"
+DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache"
+DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH"
+DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done"
+mkdir -p "$DYNAMO_CACHE_ROOT"
+
+if [ ! -f "$DYNAMO_DONE_MARKER" ]; then
+    echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..."
+    # Build into a unique temp dir, then atomically mv into place. Two
+    # concurrent runners may both build; the first to finish the rename
+    # wins, the loser cleans up. Same-directory rename() is atomic on
+    # NFS (unlike flock).
+    TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX")
+    # --mem=0: claim full node memory. Default cgroup is much smaller and
+    # rustc's link phase can OOM otherwise. CARGO_BUILD_JOBS=8 caps
+    # parallelism so peak rustc memory stays bounded on a 72-core Grace
+    # node, and `-C debuginfo=0` cuts per-process memory further.
+    srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \
+         --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \
+         --job-name="${RUNNER_NAME}-prebuild" \
+         --container-image="$SQUASH_FILE" \
+         --no-container-entrypoint --no-container-mount-home \
+         --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \
+         bash -c "
+            set -e
+            apt-get update -qq
+            apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1
+            if ! command -v cargo &>/dev/null; then
+              curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+              . \$HOME/.cargo/env
+            fi
+            if ! command -v maturin &>/dev/null; then
+              pip install --break-system-packages maturin
+            fi
+            rm -rf /tmp/dynamo_build
+            mkdir -p /tmp/dynamo_build
+            cd /tmp/dynamo_build
+            git clone https://github.com/ai-dynamo/dynamo.git
+            cd dynamo
+            git checkout $DYNAMO_HASH
+            cd lib/bindings/python/
+            export CARGO_BUILD_JOBS=8
+            export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable'
+            maturin build -o '$TEMP_BUILD'
+            cd /tmp/dynamo_build/dynamo
+            tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \
+                --exclude='lib/bindings/python/target' \
+                --exclude='.git' \
+                .
+            touch '$TEMP_BUILD/.done'
+        "
+    if [ -f "$TEMP_BUILD/.done" ]; then
+        # Atomic publish. -T keeps mv from nesting our copy inside a cache
+        # dir another runner already published; mv fails and we discard.
+        if mv -T "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then
+            echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR"
+        else
+            echo "[dynamo-prebuild] another runner published first, discarding our copy"
+            rm -rf "$TEMP_BUILD"
+        fi
+    else
+        echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2
+        rm -rf "$TEMP_BUILD"
+        exit 1
+    fi
+else
+    echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR"
+fi
+
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
 export ISL="$ISL"
@@ -81,6 +159,14 @@ git checkout sa-submission-q2-2026
 mkdir -p recipes/sglang/deepseek-v4
 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
 
+# Drop our cache-installer setup_script next to upstream's configs.
+# Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh`
+# alongside `dynamo.install: false` so srtctl skips its own pip install
+# and this script (force-reinstalling from /mnt/vast/dynamo_cache) is the
+# sole installer per rank.
+cp "$GITHUB_WORKSPACE/runners/gb300-cw-sglang-container-deps.sh" configs/gb300-cw-sglang-container-deps.sh
+chmod +x configs/gb300-cw-sglang-container-deps.sh
+
 echo "Installing srtctl..."
 # CRITICAL — uv install location.
 # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is

From 5e3340c835cd7765355daf5f761976a0783017bb Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 20:20:31 -0700
Subject: [PATCH 13/16] =?UTF-8?q?switch=20disagg=20transport=20nixl=20?=
 =?UTF-8?q?=E2=86=92=20mooncake?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml      | 16 ++++---
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 45 ++++++++++---------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
index 4f514d394..b30f5b4d1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
@@ -3,7 +3,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"
 # DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology:
 #   Prefill: 1 node, TP=4 (no DP-attn, no EP).
 #   Decode:  1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit).
-# Both on a single GB300 (4 GPUs / node). KV transfer over NIXL.
+# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**.
 #
 # Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream
 # PR #75 exactly; this DEP4 variant is a local extension to probe whether
@@ -13,12 +13,14 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"
 # Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks
 # see DP=4 replication. SGLang's --disaggregation-decode-tp and
 # --disaggregation-decode-dp flags on the prefill engine carry this
-# metadata so KV chunks route to the correct decode rank during NIXL
+# metadata so KV chunks route to the correct decode rank during the
 # transfer (server_args.py:643-654, validate_disagg_tp_size).
 #
-# Same NIXL state-buffer-transfer caveat as the TP4 sibling - see
-# disagg-gb300-1p1d-tp4.yaml header. The grace-blackwell image build
-# carries the patch.
+# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling.
+# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container
+# regardless of dynamo version (run 24973148979 with hash 6a159fed +
+# prebuild cache still hit the same watchdog timeout). PR #75 calls
+# out Mooncake as the working transport for state buffers.
 #
 # Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same
 # way as the symmetric sibling.
@@ -103,7 +105,7 @@ backend:
       trust-remote-code: true
       tensor-parallel-size: 4
       disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
       # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so
       # KV chunks route to the correct decode rank during NIXL transfer.
       disaggregation-decode-tp: 4
@@ -129,7 +131,7 @@ backend:
       moe-a2a-backend: deepep
       deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
       disaggregation-mode: "decode"
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
       moe-runner-backend: "flashinfer_mxfp4"
       disable-flashinfer-autotune: true
       mem-fraction-static: 0.90
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
index 86b262cfc..928f387f3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
@@ -7,27 +7,28 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
 #
 # Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at
 # 1P1D-TP4 fits trivially within a single rack; the explicit segment
-# below pins them so the NIXL KV transfer between prefill and decode
-# stays rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto
-# segment so each recipe owns its own value.)
+# below pins them so the KV transfer between prefill and decode stays
+# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment
+# so each recipe owns its own value.)
 #
 # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
-# (4 GPUs / node). KV transfer over NIXL. PR #75 measures saturation
-# at conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
+# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched
+# below — see "Transport: Mooncake"). PR #75 measures saturation at
+# conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
 #
-# ⚠️  NIXL state-buffer-transfer accuracy bug (upstream PR #75 body):
-# the SGLang NIXL backend currently registers and transfers the KV cache
-# correctly but DROPS the model's auxiliary state buffer (SWA / NSA /
-# Mamba). On DSv4-Pro this collapses GSM8K from 1.000 (agg) to ~0.13
-# (disagg) while throughput numbers and KV byte hashes look healthy.
-# Mooncake handles state buffers correctly; the NIXL fix mirrors that
-# (~237 lines extending KVArgsRegisterInfo/TransferInfo/register_buffer_
-# to_engine + adding send_state in
-# python/sglang/srt/disaggregation/nixl/conn.py). Until the upstream
-# sglang fix lands, the patch must be picked up via the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell container build. If
-# eval-only GSM8K runs come back near 0.13 with healthy throughput,
-# that's the cause — not a tuning issue.
+# Transport: Mooncake (not NIXL).
+#   * NIXL hung the prefill startup warmup indefinitely on this stack
+#     (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the
+#     DSv4 formatter — compat shim warns on every worker, then a
+#     4-token warmup probe never runs forward). See runs through
+#     2026-04-27 ~02:35 (gh actions 24973148979) for the exact
+#     watchdog trace.
+#   * PR #75 explicitly notes "Mooncake handles state buffers
+#     correctly" — the disagg accuracy bug it warns about is NIXL-
+#     specific, and switching to Mooncake side-steps both that bug
+#     and our warmup hang.
+#   * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container
+#     ships the Mooncake transport built-in; no extra deps needed.
 #
 # Local deltas vs upstream PR #75:
 #   * benchmark.type = sa-bench (upstream also uses sa-bench in the
@@ -66,8 +67,8 @@ extra_mount:
   - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"
 
 # Pin both nodes (1P + 1D) to the same rack on cw. Without this they
-# can land on different racks and pay the cross-rack hop on every NIXL
-# KV transfer.
+# can land on different racks and pay the cross-rack hop on every KV
+# transfer.
 sbatch_directives:
   segment: "2"
   # Use all node memory; cw default is too tight for the MXFP4 worker.
@@ -120,7 +121,7 @@ backend:
       trust-remote-code: true
       tensor-parallel-size: 4
       disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
       moe-runner-backend: "flashinfer_mxfp4"
       disable-flashinfer-autotune: true
       mem-fraction-static: 0.90
@@ -135,7 +136,7 @@ backend:
       trust-remote-code: true
       tensor-parallel-size: 4
       disaggregation-mode: "decode"
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
       moe-runner-backend: "flashinfer_mxfp4"
       disable-flashinfer-autotune: true
       mem-fraction-static: 0.90

From 83867ea50b07735b304389c1a9056ea2011cbbce Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 22:31:05 -0700
Subject: [PATCH 14/16] =?UTF-8?q?strip=20return=5Frouted=5Fexperts=20kwarg?=
 =?UTF-8?q?=20from=20dynamo=20call=20sites=20=E2=80=94=20sglang=200.5.9=20?=
 =?UTF-8?q?Engine.async=5Fgenerate=20doesn't=20accept=20it?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runners/gb300-cw-sglang-container-deps.sh | 48 +++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh
index e25362cd5..c8cb84ad0 100755
--- a/runners/gb300-cw-sglang-container-deps.sh
+++ b/runners/gb300-cw-sglang-container-deps.sh
@@ -42,3 +42,51 @@ cd /tmp/dynamo_build/dynamo
 pip install --break-system-packages -e .
 
 echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)"
+
+# --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 --------------------------
+# ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls
+# `engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9
+# bundled in lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 has an
+# Engine.async_generate signature that doesn't accept that kwarg, so every
+# request 500s with:
+#   TypeError: Engine.async_generate() got an unexpected keyword argument
+#       'return_routed_experts'
+# (See run 24973148979 → mooncake unblocked the disagg warmup; this is the
+# next failure layer.) Strip the kwarg from every call site in the
+# extracted dynamo source. `pip install -e .` above is editable, so the
+# patch propagates immediately at next `python3 -m dynamo.sglang ...`.
+DYNAMO_SRC=/tmp/dynamo_build/dynamo
+patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true)
+if [ -n "$patch_targets" ]; then
+    for f in $patch_targets; do
+        echo "[dynamo-patch] stripping return_routed_experts kwarg in $f"
+        # Match `return_routed_experts=<value>,?` where <value> is anything
+        # up to the next `,` or `)` at the same paren depth. Single-line
+        # case covers >99% of call sites; the value can be False/True/a
+        # var name. Trailing comma + whitespace is consumed too so we
+        # don't leave a stray `, )` behind.
+        python3 - "$f" <<'PYEOF'
+import re, sys
+path = sys.argv[1]
+with open(path) as fh:
+    src = fh.read()
+# Greedy on whitespace, non-greedy on the value (no commas/parens inside).
+new = re.sub(
+    r'return_routed_experts\s*=\s*[^,)]+\s*,?\s*',
+    '',
+    src,
+)
+if new != src:
+    with open(path, 'w') as fh:
+        fh.write(new)
+PYEOF
+    done
+    echo "[dynamo-patch] verifying no return_routed_experts call sites remain..."
+    if grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null; then
+        echo "[dynamo-patch] WARNING: residual matches above (likely defaults / declarations, not call sites). Inspect if 500s persist."
+    else
+        echo "[dynamo-patch] clean"
+    fi
+else
+    echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)"
+fi

From 3efc208cbaba0e8b91eb66fa9ccc0c560120ecac Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 23:20:06 -0700
Subject: [PATCH 15/16] fix dynamo regex: only match whole-line kwarg passes,
 leave assignment intact

---
 runners/gb300-cw-sglang-container-deps.sh | 38 ++++++++++++++---------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh
index c8cb84ad0..fb6e6b6f8 100755
--- a/runners/gb300-cw-sglang-container-deps.sh
+++ b/runners/gb300-cw-sglang-container-deps.sh
@@ -58,35 +58,45 @@ echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)"
 DYNAMO_SRC=/tmp/dynamo_build/dynamo
 patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true)
 if [ -n "$patch_targets" ]; then
+    # Match WHOLE LINES that are just a kwarg pass:
+    #     return_routed_experts=<simple-identifier-or-attr>,?
+    # The value is constrained to a simple identifier ([A-Za-z_][\w.]*),
+    # which deliberately excludes function calls (no `(` allowed). This
+    # leaves the multi-line assignment statement at decode_handler.py:275
+    # intact:
+    #     return_routed_experts = getattr(
+    #         self.config.server_args, "enable_return_routed_experts", False
+    #     )
+    # That assignment is dead code after we strip the kwarg passes, but
+    # leaving it costs nothing and avoids the syntax-error trap from the
+    # earlier (over-greedy) version of this patch.
     for f in $patch_targets; do
-        echo "[dynamo-patch] stripping return_routed_experts kwarg in $f"
-        # Match `return_routed_experts=<value>,?` where <value> is anything
-        # up to the next `,` or `)` at the same paren depth. Single-line
-        # case covers >99% of call sites; the value can be False/True/a
-        # var name. Trailing comma + whitespace is consumed too so we
-        # don't leave a stray `, )` behind.
+        echo "[dynamo-patch] stripping return_routed_experts kwarg lines in $f"
         python3 - "$f" <<'PYEOF'
 import re, sys
 path = sys.argv[1]
 with open(path) as fh:
     src = fh.read()
-# Greedy on whitespace, non-greedy on the value (no commas/parens inside).
+# Whole-line kwarg pass: indented `return_routed_experts=<simple>,?` then EOL.
+# `[A-Za-z_][\w.]*` matches identifiers, attribute access, True/False/None — but NOT calls.
 new = re.sub(
-    r'return_routed_experts\s*=\s*[^,)]+\s*,?\s*',
+    r'^[ \t]+return_routed_experts\s*=\s*[A-Za-z_][\w.]*\s*,?[ \t]*\n',
     '',
     src,
+    flags=re.MULTILINE,
 )
 if new != src:
     with open(path, 'w') as fh:
         fh.write(new)
+    print(f'[dynamo-patch]   patched: {path}')
+else:
+    print(f'[dynamo-patch]   no kwarg-pass lines matched in: {path}')
 PYEOF
     done
-    echo "[dynamo-patch] verifying no return_routed_experts call sites remain..."
-    if grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null; then
-        echo "[dynamo-patch] WARNING: residual matches above (likely defaults / declarations, not call sites). Inspect if 500s persist."
-    else
-        echo "[dynamo-patch] clean"
-    fi
+    # Sanity: any remaining occurrence is fine if it's the assignment;
+    # log it so the next person knows what's left.
+    echo "[dynamo-patch] residual occurrences (expected: only the dead assignment in decode_handler.py):"
+    grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || echo "  (none)"
 else
     echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)"
 fi

From 173bd41dee956a2b48c37e043289c021463649d6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 23:09:49 -0700
Subject: [PATCH 16/16] PR85

---
 .github/configs/nvidia-master.yaml            |  77 ++++++---
 .../1k1k/disagg-1p1d-dep4-mega-moe.yaml       | 128 ++++++++++++++
 .../1k1k/disagg-1p1d-tp4-mxfp4.yaml           |  86 ++++++++++
 .../disagg-1p2d-dep4-to-dep8-mega-moe.yaml    | 127 ++++++++++++++
 .../1k1k/disagg-2p2d-dep8-mega-moe.yaml       | 126 ++++++++++++++
 .../1k1k/disagg-2p2d-tp8-mxfp4.yaml           |  98 +++++++++++
 .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml      | 153 -----------------
 .../1k1k/disagg-gb300-1p1d-tp4.yaml           | 159 ------------------
 runners/gb300-cw-sglang-container-deps.sh     |  28 +++
 runners/launch_gb300-cw.sh                    |  20 ++-
 10 files changed, 660 insertions(+), 342 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index cb9c0c675..3f905d3c8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7728,49 +7728,82 @@ dsv4-fp4-gb300-dynamo-sglang:
   framework: dynamo-sglang
   multinode: true
   disagg: true
-  # 1P + 1D on a single GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL
-  # KV transfer. Recipes staged at benchmarks/multi_node/srt-slurm-
-  # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm
-  # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave
-  # (2x 18-node racks); recipes set their own sbatch_directives.segment
-  # for rack pinning.
-  #
-  # Two search-space bands:
-  #   * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give
-  #     single-user latency floor; 4-128 covers the saturation curve
-  #     mirroring NVIDIA/srt-slurm PR #75.
-  #   * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn +
-  #     DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head
-  #     comparison (find the crossover where DPA beats TP-only); 256-
-  #     1024 extends past the symmetric saturation point (~conc=128 /
-  #     838 Total TPS/GPU per PR #75).
+  # Five disagg topologies from NVIDIA/srt-slurm PR #85 branch
+  # recipes/dsv4-agg-disagg, overlaid with cw-specific fields by
+  # launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave (2x 18-node
+  # racks); recipes set their own sbatch_directives.segment for rack
+  # pinning. All use NIXL KV transfer.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+    # 1P1D TP=4 MXFP4 — low-latency baseline (2 nodes)
+    - conc-list: [4, 8, 16, 32, 64, 128]
       prefill:
         num-worker: 1
         tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml"
+        - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-tp4-mxfp4.yaml"
       decode:
         num-worker: 1
         tp: 4
         ep: 1
         dp-attn: false
-    - conc-list: [16, 32, 64, 128, 256, 512, 1024]
+    # 1P1D DEP4 mega_moe — TEP disagg (2 nodes)
+    - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
       prefill:
         num-worker: 1
         tp: 4
-        ep: 1
-        dp-attn: false
+        ep: 4
+        dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml"
+        - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-dep4-mega-moe.yaml"
       decode:
         num-worker: 1
         tp: 4
         ep: 4
         dp-attn: true
+    # 1P2D asymmetric DEP4->DEP8 mega_moe — best per-GPU efficiency (3 nodes)
+    - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p2d-dep4-to-dep8-mega-moe.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    # 2P2D symmetric DEP8 mega_moe — largest throughput (4 nodes)
+    - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-dep8-mega-moe.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    # 2P2D TP=8 MXFP4 — TP-only 4-node baseline (4 nodes)
+    - conc-list: [4, 8, 16, 32, 64, 128, 256, 512]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-tp8-mxfp4.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml
new file mode 100644
index 000000000..72baef909
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml
@@ -0,0 +1,128 @@
+name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k"
+
+dynamo:
+  install: false  # skip srtctl's own install; setup_script is the sole installer
+
+setup_script: gb300-cw-sglang-container-deps.sh  # per-rank install from prebuilt cache
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"  # cache read by setup_script
+
+sbatch_directives:
+  segment: "2"  # rack pinning: keep both nodes (1P + 1D) in one rack
+  mem: "0"  # use all node memory
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+frontend:
+  type: dynamo
+  nginx_container: nginx
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "mxfp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      mem-fraction-static: 0.90
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      mem-fraction-static: 0.90
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml
new file mode 100644
index 000000000..a0b60a00b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml
@@ -0,0 +1,86 @@
+name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k"
+
+dynamo:
+  install: false  # skip srtctl's own install; setup_script is the sole installer
+
+setup_script: gb300-cw-sglang-container-deps.sh  # per-rank install from prebuilt cache
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"  # cache read by setup_script
+
+sbatch_directives:
+  segment: "2"  # rack pinning: keep both nodes (1P + 1D) in one rack
+  mem: "0"  # use all node memory
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+frontend:
+  type: dynamo
+  nginx_container: nginx
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "mxfp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+
+  decode_environment:
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 4
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.90
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "4x8x16x32x64x128"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml
new file mode 100644
index 000000000..569373509
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml
@@ -0,0 +1,127 @@
+name: "dsv4-pro-gb300-disagg-1p2d-dep4-to-dep8-mega-moe-1k1k"  # 1 prefill node (TP4/DP4) feeding one 2-node decode worker (TP8/DP8)
+
+dynamo:
+  install: false  # srtctl skips its own dynamo install; setup_script below is the installer
+
+setup_script: gb300-cw-sglang-container-deps.sh  # installs dynamo from the prebuilt cache and patches sglang (NIXL fix)
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"  # prebuilt dynamo cache consumed by setup_script
+
+sbatch_directives:
+  segment: "3"  # one Slurm segment covering all 3 nodes (1 prefill + 2 decode)
+  mem: "0"  # request all of each node's memory
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440  # x 10 s interval = up to ~4 h, assuming attempts run back to back
+  interval_seconds: 10
+
+frontend:
+  type: dynamo
+  nginx_container: nginx  # container alias; launch_gb300-cw.sh maps it to the nginx squash file
+
+model:
+  path: "dsv4-pro"  # model path alias; must match SRT_SLURM_MODEL_PREFIX in launch_gb300-cw.sh
+  container: "dsv4-grace-blackwell"  # container alias registered by launch_gb300-cw.sh
+  precision: "mxfp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1  # 1 node x 4 GPUs = prefill tensor-parallel-size 4
+  decode_nodes: 2  # 2 nodes x 4 GPUs = decode tensor-parallel-size 8
+  prefill_workers: 1
+  decode_workers: 1  # a single decode worker spans both decode nodes
+  gpus_per_node: 4
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"  # set for prefill only; decode_environment omits it
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"  # MEGA-MoE/DeepEP setting, relevant because DP-attention + EP is enabled below
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"  # MEGA-MoE/DeepEP setting, relevant because DP-attention + EP is enabled below
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4  # same as TP: with DP attention each rank is its own attention DP unit
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"  # MoE sharded across EP ranks; ep_size presumably follows tp_size - confirm
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"  # NOTE(review): decode is TP8/DP8; no disaggregation-decode-tp/-dp set here - confirm
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.90
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # same as TP: with DP attention each rank is its own attention DP unit
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"  # MoE sharded across EP ranks; ep_size presumably follows tp_size - confirm
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.83  # lower than the prefill worker's 0.90
+      max-running-requests: 2048  # equals the highest benchmark concurrency
+      cuda-graph-max-bs: 2048
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048"  # "x"-separated concurrency sweep
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml
new file mode 100644
index 000000000..8d82d58cb
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml
@@ -0,0 +1,126 @@
+name: "dsv4-pro-gb300-disagg-2p2d-dep8-mega-moe-1k1k"  # 2-node prefill worker + 2-node decode worker, both TP8/DP8
+
+dynamo:
+  install: false  # srtctl skips its own dynamo install; setup_script below is the installer
+
+setup_script: gb300-cw-sglang-container-deps.sh  # installs dynamo from the prebuilt cache and patches sglang (NIXL fix)
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"  # prebuilt dynamo cache consumed by setup_script
+
+sbatch_directives:
+  segment: "4"  # one Slurm segment covering all 4 nodes (2 prefill + 2 decode)
+  mem: "0"  # request all of each node's memory
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440  # x 10 s interval = up to ~4 h, assuming attempts run back to back
+  interval_seconds: 10
+
+frontend:
+  type: dynamo
+  nginx_container: nginx  # container alias; launch_gb300-cw.sh maps it to the nginx squash file
+
+model:
+  path: "dsv4-pro"  # model path alias; must match SRT_SLURM_MODEL_PREFIX in launch_gb300-cw.sh
+  container: "dsv4-grace-blackwell"  # container alias registered by launch_gb300-cw.sh
+  precision: "mxfp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = prefill tensor-parallel-size 8
+  decode_nodes: 2  # 2 nodes x 4 GPUs = decode tensor-parallel-size 8
+  prefill_workers: 1  # a single prefill worker spans both prefill nodes
+  decode_workers: 1  # a single decode worker spans both decode nodes
+  gpus_per_node: 4
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"  # MEGA-MoE/DeepEP setting, relevant because DP-attention + EP is enabled below
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  decode_environment:  # identical to prefill_environment in this recipe
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # same as TP: with DP attention each rank is its own attention DP unit
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"  # MoE sharded across EP ranks; ep_size presumably follows tp_size - confirm
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.83
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # same as TP: with DP attention each rank is its own attention DP unit
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"  # MoE sharded across EP ranks; ep_size presumably follows tp_size - confirm
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.83
+      max-running-requests: 2048  # equals the highest benchmark concurrency
+      cuda-graph-max-bs: 2048
+      chunked-prefill-size: 32768
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048"  # "x"-separated concurrency sweep
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml
new file mode 100644
index 000000000..1b697d826
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml
@@ -0,0 +1,98 @@
+name: "dsv4-pro-gb300-disagg-2p2d-tp8-mxfp4-1k1k"  # TP-only recipe (no DP attention / DeepEP): 2-node prefill + 2-node decode, both TP8
+
+dynamo:
+  install: false  # srtctl skips its own dynamo install; setup_script below is the installer
+
+setup_script: gb300-cw-sglang-container-deps.sh  # installs dynamo from the prebuilt cache and patches sglang (NIXL fix)
+
+extra_mount:
+  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"  # prebuilt dynamo cache consumed by setup_script
+
+sbatch_directives:
+  segment: "4"  # one Slurm segment covering all 4 nodes (2 prefill + 2 decode)
+  mem: "0"  # request all of each node's memory
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440  # x 10 s interval = up to ~4 h, assuming attempts run back to back
+  interval_seconds: 10
+
+frontend:
+  type: dynamo
+  nginx_container: nginx  # container alias; launch_gb300-cw.sh maps it to the nginx squash file
+
+model:
+  path: "dsv4-pro"  # model path alias; must match SRT_SLURM_MODEL_PREFIX in launch_gb300-cw.sh
+  container: "dsv4-grace-blackwell"  # container alias registered by launch_gb300-cw.sh
+  precision: "mxfp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = prefill tensor-parallel-size 8
+  decode_nodes: 2  # 2 nodes x 4 GPUs = decode tensor-parallel-size 8
+  prefill_workers: 1  # a single prefill worker spans both prefill nodes
+  decode_workers: 1  # a single decode worker spans both decode nodes
+  gpus_per_node: 4
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # identical to prefill_environment in this recipe
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 8
+      moe-runner-backend: "flashinfer_mxfp4"  # MXFP4 MoE kernels, in line with model.precision "mxfp4"
+      disable-flashinfer-autotune: true
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 8
+      moe-runner-backend: "flashinfer_mxfp4"  # MXFP4 MoE kernels, in line with model.precision "mxfp4"
+      disable-flashinfer-autotune: true
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl  # relies on the sglang NIXL state-buffer fix installed by setup_script
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512  # equals the highest benchmark concurrency
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 8192
+      disable-radix-cache: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "4x8x16x32x64x128x256x512"  # "x"-separated concurrency sweep
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
deleted file mode 100644
index b30f5b4d1..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml
+++ /dev/null
@@ -1,153 +0,0 @@
-name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"
-
-# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology:
-#   Prefill: 1 node, TP=4 (no DP-attn, no EP).
-#   Decode:  1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit).
-# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**.
-#
-# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream
-# PR #75 exactly; this DEP4 variant is a local extension to probe whether
-# decode-side DP-attn + DeepEP unlocks throughput past the symmetric
-# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75).
-#
-# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks
-# see DP=4 replication. SGLang's --disaggregation-decode-tp and
-# --disaggregation-decode-dp flags on the prefill engine carry this
-# metadata so KV chunks route to the correct decode rank during the
-# transfer (server_args.py:643-654, validate_disagg_tp_size).
-#
-# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling.
-# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container
-# regardless of dynamo version (run 24973148979 with hash 6a159fed +
-# prebuild cache still hit the same watchdog timeout). PR #75 calls
-# out Mooncake as the working transport for state buffers.
-#
-# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same
-# way as the symmetric sibling.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
-  precision: "fp4"
-
-# Build dynamo from hash 6a159fed via prebuild cache. See the TP4
-# sibling header for the full rationale and the casualty timeline —
-# short version: arm64 container ships no ai-dynamo, dev wheels API-
-# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror
-# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall
-# from /mnt/vast/dynamo_cache/<hash> per rank.
-dynamo:
-  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
-  install: false
-
-setup_script: gb300-cw-sglang-container-deps.sh
-
-extra_mount:
-  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"
-
-sbatch_directives:
-  segment: "2"
-  mem: "0"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-
-  decode_environment:
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-    # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn +
-    # EP is enabled. Mirror gen_launch.py medium/large defaults.
-    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
-    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
-    SGLANG_OPT_USE_FAST_MASK_EP: "1"
-    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
-    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      tensor-parallel-size: 4
-      disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: mooncake
-      # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so
-      # KV chunks route to the correct decode rank during NIXL transfer.
-      disaggregation-decode-tp: 4
-      disaggregation-decode-dp: 4
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-      mem-fraction-static: 0.90
-      max-running-requests: 128
-      cuda-graph-max-bs: 128
-      chunked-prefill-size: 8192
-      disable-radix-cache: true
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      tensor-parallel-size: 4
-      # Full DP-attn on 4 GPUs: each rank is its own DP unit for
-      # attention; MoE is sharded across EP (ep_size = tp_size = 4
-      # implicit when --moe-a2a-backend deepep).
-      enable-dp-attention: true
-      dp-size: 4
-      moe-a2a-backend: deepep
-      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-      disaggregation-mode: "decode"
-      disaggregation-transfer-backend: mooncake
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-      mem-fraction-static: 0.90
-      max-running-requests: 512
-      cuda-graph-max-bs: 512
-      chunked-prefill-size: 8192
-      disable-radix-cache: true
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  random_range_ratio: 0.8
-  # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison
-  # (where does decode-side DPA start beating TP-only?); 256-1024
-  # probes throughput past the symmetric saturation point.
-  concurrencies: "16x32x64x128x256x512x1024"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
deleted file mode 100644
index 928f387f3..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml
+++ /dev/null
@@ -1,159 +0,0 @@
-name: "dsv4-sglang-disagg-gb300-1p1d-tp4"
-
-# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang +
-# dynamo frontend. Ported from NVIDIA/srt-slurm PR #75
-# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of
-# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree.
-#
-# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at
-# 1P1D-TP4 fits trivially within a single rack; the explicit segment
-# below pins them so the KV transfer between prefill and decode stays
-# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment
-# so each recipe owns its own value.)
-#
-# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
-# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched
-# below — see "Transport: Mooncake"). PR #75 measures saturation at
-# conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
-#
-# Transport: Mooncake (not NIXL).
-#   * NIXL hung the prefill startup warmup indefinitely on this stack
-#     (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the
-#     DSv4 formatter — compat shim warns on every worker, then a
-#     4-token warmup probe never runs forward). See runs through
-#     2026-04-27 ~02:35 (gh actions 24973148979) for the exact
-#     watchdog trace.
-#   * PR #75 explicitly notes "Mooncake handles state buffers
-#     correctly" — the disagg accuracy bug it warns about is NIXL-
-#     specific, and switching to Mooncake side-steps both that bug
-#     and our warmup hang.
-#   * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container
-#     ships the Mooncake transport built-in; no extra deps needed.
-#
-# Local deltas vs upstream PR #75:
-#   * benchmark.type = sa-bench (upstream also uses sa-bench in the
-#     latest revision; matches).
-#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring
-#     the dynamo-vllm gb300 recipe convention. Upstream targets a
-#     different cluster and doesn't need this.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
-  precision: "fp4"
-
-# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling
-# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace-
-# blackwell_arm64 image lacks both a working ai-dynamo and the rust
-# toolchain for an in-container build; pinning a published dev wheel
-# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat
-# shim warns then disagg startup warmup hangs). Same prebuild-cache
-# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel
-# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/<hash>,
-# and the setup_script below force-reinstalls from cache per rank
-# (~30 s, no per-rank rust build, no API drift).
-dynamo:
-  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
-  # install: false → srtctl skips its own pip install; setup_script is
-  # the sole installer.
-  install: false
-
-setup_script: gb300-cw-sglang-container-deps.sh
-
-# Mount /mnt/vast/dynamo_cache into every worker container so each
-# rank can pip-install from the wheel that launch_gb300-cw.sh
-# pre-built there.
-extra_mount:
-  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"
-
-# Pin both nodes (1P + 1D) to the same rack on cw. Without this they
-# can land on different racks and pay the cross-rack hop on every KV
-# transfer.
-sbatch_directives:
-  segment: "2"
-  # Use all node memory; cw default is too tight for the MXFP4 worker.
-  mem: "0"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-
-  decode_environment:
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      tensor-parallel-size: 4
-      disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: mooncake
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-      mem-fraction-static: 0.90
-      max-running-requests: 128
-      cuda-graph-max-bs: 128
-      chunked-prefill-size: 8192
-      disable-radix-cache: true
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      tensor-parallel-size: 4
-      disaggregation-mode: "decode"
-      disaggregation-transfer-backend: mooncake
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-      mem-fraction-static: 0.90
-      max-running-requests: 128
-      cuda-graph-max-bs: 128
-      chunked-prefill-size: 8192
-      disable-radix-cache: true
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  random_range_ratio: 0.8
-  # Low-latency band — TP4 1P1D saturates near conc=128 on GB300
-  # (PR #75 verified: 838 Total TPS/GPU at conc=128). Conc 1/2 give
-  # single-user latency floor reference; 4-128 covers the saturation
-  # curve. For high-conc Pareto use the DEP variants.
-  concurrencies: "1x2x4x8x16x32x64x128"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh
index fb6e6b6f8..348c436ef 100755
--- a/runners/gb300-cw-sglang-container-deps.sh
+++ b/runners/gb300-cw-sglang-container-deps.sh
@@ -43,6 +43,34 @@ pip install --break-system-packages -e .
 
 echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)"
 
+# --- NIXL DSv4 state-buffer patch: sglang PR #23773 --------------------------
+# The disagg recipes use NIXL KV transfer. Without this patch, NIXL
+# silently drops auxiliary state buffers (SWA / NSA / Mamba), causing
+# decode-side accuracy to collapse on DSv4-Pro. The patch mirrors what
+# the Mooncake backend already does. See NVIDIA/srt-slurm PR #85 README.
+SGLANG_DIR="${SGLANG_DIR:-/sgl-workspace/sglang}"
+SGLANG_REMOTE="https://github.com/sgl-project/sglang.git"
+SGLANG_PR_NUMBER="23773"
+SGLANG_PR_REF="refs/pull/${SGLANG_PR_NUMBER}/head"
+SGLANG_LOCAL_BRANCH="nixl-dsv4-pr-${SGLANG_PR_NUMBER}"
+
+echo "=== Installing SGLang NIXL DSV4 fix from PR #${SGLANG_PR_NUMBER} ==="
+
+if [ -d "$SGLANG_DIR/.git" ]; then
+    # Use `git -C` instead of `cd` so later sections keep their working directory.
+    git config --global --add safe.directory "$SGLANG_DIR" 2>/dev/null || true
+    if git -C "$SGLANG_DIR" remote get-url origin >/dev/null 2>&1; then
+        git -C "$SGLANG_DIR" remote set-url origin "$SGLANG_REMOTE"
+    else
+        git -C "$SGLANG_DIR" remote add origin "$SGLANG_REMOTE"
+    fi
+    git -C "$SGLANG_DIR" fetch --depth 1 origin "$SGLANG_PR_REF"  # shallow fetch of the PR head only
+    git -C "$SGLANG_DIR" checkout -f -B "$SGLANG_LOCAL_BRANCH" FETCH_HEAD
+    echo "Checked out SGLang PR #${SGLANG_PR_NUMBER} at $(git -C "$SGLANG_DIR" rev-parse HEAD)"
+else
+    echo "WARNING: $SGLANG_DIR/.git not found; skipping NIXL patch (container may already include fix)"
+fi
+
 # --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 --------------------------
 # ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls
 # `engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index b03dc6dd9..edbb55375 100755
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -14,7 +14,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION ==
     # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in
     # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/.
     export MODEL_PATH="/mnt/vast/models/dsv4/"
-    export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
+    export SRT_SLURM_MODEL_PREFIX="dsv4-pro"
 else
     echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang"
     exit 1
@@ -150,14 +150,16 @@ fi
 
 git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
-git checkout sa-submission-q2-2026
+git checkout recipes/dsv4-agg-disagg
 
-# Overlay our hand-rolled DSv4 SGLang recipes. NVIDIA/srt-slurm has no
-# upstream sglang DSv4 disagg recipe yet beyond PR #75's 1P1D-TP4
-# entry, so we ship the recipe locally and copy it in here. `cp -rT`
-# overlays onto a possibly-existing upstream stub instead of nesting.
-mkdir -p recipes/sglang/deepseek-v4
-cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
+# Overlay our cw-adapted DSv4 SGLang disagg recipes onto the upstream
+# recipes from PR #85. The upstream recipes at
+# recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/ don't carry
+# cw-specific fields (dynamo.install, setup_script, extra_mount,
+# sbatch_directives), so we overlay locally-maintained copies that add
+# those. `cp -rT` overwrites same-named upstream files in place.
+mkdir -p recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k" recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
 
 # Drop our cache-installer setup_script next to upstream's configs.
 # Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh`
@@ -223,7 +225,9 @@ model_paths:
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}
+  dsv4-grace-blackwell: ${SQUASH_FILE}
   "${IMAGE}": ${SQUASH_FILE}
+  nginx: ${NGINX_SQUASH_FILE}
   nginx-sqsh: ${NGINX_SQUASH_FILE}
 # Auto-emission of #SBATCH --segment={total_nodes} is turned off here
 # because each gb300 recipe sets its own segment via sbatch_directives