From 7dae672d3b5271ccdf1bab5f9c5b6f190abcebc8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:07:01 -0700 Subject: [PATCH 01/13] Add H100 config: dsv4-fp8-dynamo-vllm (DeepSeek-V4-Pro multinode disagg) Port the DSV4-Pro vLLM recipe from single-node H200 to H100 as multinode disaggregated serving via Dynamo. The ~862 GB FP8 weights don't fit on one 8xH100-80GB node (640 GB), so each side must own >=2 nodes; with the h100-multinode pool at 4 nodes, 2P+2D DP16/EP16 per side (32 H100s total) is the minimum viable shape and fills the pool exactly. Engine flags match the single-node H200 recipe: deepseek_v4 tokenizer, tool-call, and reasoning parsers; FP8 KV cache; block size 256; prefix caching disabled; compilation mode 0 with FULL_DECODE_ONLY cudagraph. max-model-len is capped at 16384 (H200's 800k does not fit KV across two 80GB decode nodes). Keeps H100-tuned knobs from the DSR1 vLLM recipe: VLLM_MOE_DP_CHUNK_SIZE=192, deepep_{high_throughput,low_latency} all2all backends, NixlConnector P<->D KV transfer, VLLM_USE_DEEP_GEMM, dynamo 1.0.1. srt-slurm recipes are bundled locally at benchmarks/multi_node/srt_slurm_recipes/ and overlaid onto the srt-slurm clone at runtime. This is temporary until the recipes can be upstreamed to NVIDIA/srt-slurm. Changes: - recipes: benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/ {1k1k,8k1k}/disagg-h100-fp8-1p1d-dep16-dep16.yaml - runner: launch_h100-dgxc-slurm.sh gains a dynamo-vllm framework branch (dsv4-fp8 model path at /mnt/nfs/lustre/models/dsv4-fp8, vLLM container squash mapping, srtslurm.yaml dynamo-vllm alias) and an unconditional local-recipes overlay after the srt-slurm checkout - master: .github/configs/nvidia-master.yaml adds dsv4-fp8-h100-dynamo-vllm with 1k1k conc [4,8,16,32,64,128] and 8k1k conc [4,8,16,32,64] Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 50 +++++++ .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 123 ++++++++++++++++++ .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 114 ++++++++++++++++ perf-changelog.yaml | 14 ++ runners/launch_h100-dgxc-slurm.sh | 24 +++- 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 96273444f..bc1a871ff 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2384,6 +2384,56 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm. +# 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool). +# Minimum viable disagg shape: DSV4-Pro FP8 weights (~862 GB) don't fit on one +# H100 node (8x80GB=640GB), so each side must own >=2 nodes. Recipes bundled +# locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed. 
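+# Back-of-envelope fit check (weights only, using the figures above; the
+# per-GPU share under EP16 is a lower bound, since non-expert weights are
+# replicated across the DP ranks):
+#   1 node:  862 GB vs  8 x 80 GB =  640 GB -> does not fit
+#   2 nodes: 862 GB vs 16 x 80 GB = 1280 GB -> ~54 GB/GPU, leaves KV room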
+dsv4-fp8-h100-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h100-multinode + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu129-amd64 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml new file mode 100644 index 000000000..1aaaf65e3 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -0,0 +1,123 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) +# +# Forked from NVIDIA/srt-slurm recipes/vllm/deepseek-r1/disagg-h100-16gpu.yaml. +# Engine flags updated to match the single-node H200 DSV4 recipe (deepseek_v4 +# tokenizer/parsers, FP8 KV cache, block_size=256, prefix caching disabled, +# compilation mode 0). Kept from DSR1: NixlConnector P<->D KV transfer, +# VLLM_MOE_DP_CHUNK_SIZE=192 for H100 80GB (vs H200 141GB default of 384), +# deepep all2all backends, VLLM_USE_DEEP_GEMM. +# +# max-model-len is 16384, not H200's 800000 — KV for 800k context does not +# fit across two 80GB decode nodes. 
+# +# DP+EP configuration: +# - Each GPU runs its own vLLM process (tensor-parallel-size: 1) +# - 1 prefill endpoint x 16 GPUs (2 nodes, DP16) -> 16 prefill processes +# - 1 decode endpoint x 16 GPUs (2 nodes, DP16) -> 16 decode processes +# - Total: 32 GPUs across 4 nodes (fills the h100-multinode pool) + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_high_throughput" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_low_latency" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml new file mode 100644 index 000000000..471efcb5b --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -0,0 +1,114 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) — 8k/1k +# +# Same engine flags as the 1k1k variant. Only the benchmark block differs +# (ISL=8192, tighter concurrency sweep due to larger prefill work). 
+# +# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +# for the full rationale (FP8 KV cache, block_size=256, deepseek_v4 parsers, +# NixlConnector, H100-tuned VLLM_MOE_DP_CHUNK_SIZE=192). + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_high_throughput" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_low_latency" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b2e138c8..d951b92da 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1745,3 +1745,17 @@ - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 + +- config-keys: + - dsv4-fp8-h100-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP8 H100 multinode disagg benchmark via dynamo-vllm" + - "2 prefill nodes + 2 decode nodes (32 H100s total, DP16/EP16 per side)" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Engine flags match H200 single-node recipe (deepseek_v4 tokenizer/parsers, FP8 KV cache, block size 256, prefix caching 
disabled)" + - "max-model-len 16384 (H100 80GB KV headroom; H200's 800k does not fit across 2 decode nodes)" + - "VLLM_MOE_DP_CHUNK_SIZE=192 and deepep_{high_throughput,low_latency} all2all backends tuned for H100 80GB" + - "NixlConnector P<->D KV transfer, dynamo 1.0.1" + - "srt-slurm recipes bundled locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed" + - "Configs: 1k1k conc 4-128, 8k1k conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..33efc3e6c 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -29,8 +29,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-trt: $MODEL_PREFIX/$PRECISION" exit 1 fi + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" + export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + else + echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" + exit 1 + fi else - echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -45,6 +54,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 + # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until + # the upstream PR lands; cp -r merges directories on GNU cp. + LOCAL_RECIPES_DIR="$GITHUB_WORKSPACE/benchmarks/multi_node/srt_slurm_recipes" + if [ -d "$LOCAL_RECIPES_DIR" ]; then + echo "Overlaying local srt-slurm recipes from $LOCAL_RECIPES_DIR" + cp -r "$LOCAL_RECIPES_DIR"/* recipes/ + fi + echo "Installing srtctl..." 
export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" export UV_CACHE_DIR="/mnt/nfs/sa-shared/.uv/cache" @@ -78,6 +95,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + # vLLM container mapping - IMAGE is a Docker Hub reference (no registry prefix swap) + CONTAINER_KEY="$IMAGE" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" fi export ISL="$ISL" @@ -105,6 +126,7 @@ model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}" From 0cd54afb3495a3f5701d796f69f3f693f1d6a8d9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:07:43 -0700 Subject: [PATCH 02/13] Update perf-changelog pr-link to PR 1142 Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d951b92da..af66beced 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1758,4 +1758,4 @@ - "NixlConnector P<->D KV transfer, dynamo 1.0.1" - "srt-slurm recipes bundled locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed" - "Configs: 1k1k conc 4-128, 8k1k conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1142 From 88b80c01f1c403e5ee0e225d69daa420b1ad5ee5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:31:42 -0700 Subject: [PATCH 03/13] launch_h100: pre-create logs dir and tar outputs/ on early failure Sweep 24909864822 had all three multinode jobs fail in 6s with ExitCode=1:0 and no sweep_JOBID.log written, leaving no usable diagnostic in the CI artifact. Two defensive changes: 1. mkdir -p outputs/$JOB_ID/logs before polling, so Slurm's #SBATCH --output=outputs/%j/logs/sweep_%j.log directive can open the target file even when the compute-node stepd lacks permission to create the parent dir on NFS. 2. On the "job failed before creating log file" path, tar outputs/$JOB_ID/ (sbatch_script.sh, config.yaml, any partial log, and the scontrol dump) into multinode_server_logs.tar.gz so the CI artifact captures what was submitted and why Slurm exited early. Previously exit 1 ran before the tar step. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_h100-dgxc-slurm.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 33efc3e6c..c75507e33 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -178,11 +178,21 @@ EOF LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + # Defensive: pre-create the logs subdir so Slurm's #SBATCH --output=... + # /%j/logs/sweep_%j.log can open the target file even on NFS mounts + # where the compute-node Slurm stepd lacks permission to mkdir -p. + mkdir -p "$LOGS_DIR" 2>/dev/null || true + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do if ! 
squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + scontrol show job "$JOB_ID" | tee "outputs/$JOB_ID/scontrol_show_job.txt" 2>/dev/null + # Preserve sbatch_script.sh, config.yaml, metadata, and any partial + # log so the failure can be diagnosed from the CI artifact. + if [ -d "outputs/$JOB_ID" ]; then + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "outputs/$JOB_ID" . + fi exit 1 fi echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." From e0359c67db71b53697ca128ad2669862f18f8458 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:59:08 -0700 Subject: [PATCH 04/13] Fix dsv4 dynamo-vllm: switch to alec-flowers/srt-slurm@PR71 fork PR 1142's first real sweep hit "ModuleNotFoundError: No module named 'vllm.inputs.data'" on all three multinode jobs. Same error as PR 1129 on GB200. Root cause: ai-dynamo 1.0.1 (installed by NVIDIA/srt-slurm@sa-submission-q2-2026 via `dynamo: { version: 1.0.1 }`) imports vllm.inputs.data.TokensPrompt, a path removed in the DSV4 vLLM wheel. Dynamo workers crash during import before any vLLM flag matters. Fix, mirroring PR 1129: - launch_h100-dgxc-slurm.sh: override srt-slurm clone URL/ref via SRT_SLURM_REPO_URL and SRT_SLURM_REF env vars, set to alec-flowers/srt-slurm@d60e3f1c (head of NVIDIA/srt-slurm#71) for dynamo-vllm+dsv4. All other frameworks/models keep NVIDIA upstream. - Recipes: replace `dynamo.version: 1.0.1` with `dynamo.hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b`. The fork's schema accepts `hash:` for pinning a specific ai-dynamo/dynamo commit. That commit has the matching vllm.inputs import path. - Recipes: adopt DSV4-specific flags PR 1129 proved necessary for startup: `enforce-eager: true` (prefill only), `enable-sleep-mode: true`, `no-disable-hybrid-kv-cache-manager: true`, explicit `kv-transfer-config` (NixlConnector kv_both), env vars VLLM_SERVER_DEV_MODE=1 and TILELANG_CLEANUP_TEMP_FILES=1. - Recipes: drop `data-parallel-hybrid-lb` and `async-scheduling` (DSR1 patterns that PR 1129 omitted on DSV4; keep minimal delta from DSV4 H200 single-node). Kept H100-specific knobs: VLLM_MOE_DP_CHUNK_SIZE=192, deepep_{high_throughput, low_latency} all2all backends, VLLM_USE_DEEP_GEMM. Skipped GB200-only flags (NCCL_MNNVL_ENABLE, NCCL_NVLS_ENABLE, VLLM_USE_NCCL_SYMM_MEM). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 23 ++++++++++++++----- .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 23 ++++++++++++++----- runners/launch_h100-dgxc-slurm.sh | 13 +++++++++-- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 1aaaf65e3..fb982b757 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -23,8 +23,11 @@ model: container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), +# which extends the dynamo config to accept `hash` as well as `version`. 
dynamo: - version: 1.0.1 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -52,6 +55,8 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" @@ -62,12 +67,15 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" vllm_config: prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -79,19 +87,21 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_high_throughput" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -103,8 +113,8 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_low_latency" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true @@ -112,7 +122,8 @@ backend: max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 471efcb5b..8e194e719 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -14,8 +14,11 @@ model: container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), +# which extends the dynamo config to accept `hash` as well as `version`. 
dynamo: - version: 1.0.1 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -43,6 +46,8 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" @@ -53,12 +58,15 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" vllm_config: prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -70,19 +78,21 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_high_throughput" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -94,8 +104,8 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_low_latency" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true @@ -103,7 +113,8 @@ backend: max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index c75507e33..3a279b6d3 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -34,6 +34,15 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + # NVIDIA/srt-slurm@sa-submission-q2-2026 installs ai-dynamo 1.0.1, + # which imports vllm.inputs.data.TokensPrompt — a path the DSV4 + # vLLM wheel has removed. Switch to alec-flowers' fork (head of + # https://github.com/NVIDIA/srt-slurm/pull/71) which supports + # dynamo.hash pinning so the recipe can pick a dynamo commit + # compatible with the DSV4 vllm.inputs layout. Matches PR #1129 + # on GB200. 
+ export SRT_SLURM_REPO_URL="https://github.com/alec-flowers/srt-slurm.git" + export SRT_SLURM_REF="d60e3f1c7921721e52af01afaab59a70a1631106" else echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" exit 1 @@ -50,9 +59,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone "${SRT_SLURM_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout "${SRT_SLURM_REF:-sa-submission-q2-2026}" # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until # the upstream PR lands; cp -r merges directories on GNU cp. From b92ef5aa5d2e6d761ac9469a9033dc4250c0f758 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:13:50 -0700 Subject: [PATCH 05/13] dsv4 h100 recipes: drop API-server-only flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynamo vLLM worker argparse rejects --enable-auto-tool-choice and --tool-call-parser — the sweep from e0359c67 got past the module-import error but failed with "unrecognized arguments: --enable-auto-tool-choice --tool-call-parser deepseek_v4" during prefill worker startup. These flags (along with --tokenizer-mode and --reasoning-parser) are OpenAI API-server concerns. In disagg, Dynamo is the frontend and does tokenization / tool parsing itself; the vLLM workers are engine-only processes and expose only engine args. The H200 single-node recipe uses `vllm serve` directly (full API server), which is why those flags work there but fail here. Kimi K2.5 (only other working dynamo-vllm recipe) also omits all four flags — that's the precedent. Removed from both prefill and decode: tokenizer-mode: deepseek_v4 tool-call-parser: deepseek_v4 reasoning-parser: deepseek_v4 enable-auto-tool-choice: true Kept trust-remote-code: true (needed for DSV4's custom modeling code). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 13 +++++-------- .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 11 +++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index fb982b757..710376db3 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -74,13 +74,14 @@ backend: PYTHONUNBUFFERED: "1" vllm_config: + # Tokenizer mode, tool-call parser, reasoning parser, and + # enable-auto-tool-choice are OpenAI API-server flags; Dynamo is the + # frontend in this disagg setup and handles tool/reasoning parsing + # itself. The vLLM workers are engine-only processes and their argparse + # rejects those flags (matches kimi-k2.5 recipe which omits them too). 
prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 @@ -103,10 +104,6 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 8e194e719..5dd7e4d6c 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -65,13 +65,12 @@ backend: PYTHONUNBUFFERED: "1" vllm_config: + # API-server flags (tokenizer-mode, tool-call-parser, reasoning-parser, + # enable-auto-tool-choice) are handled by Dynamo frontend, not the + # vLLM engine workers. See 1k1k recipe for rationale. prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 @@ -94,10 +93,6 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 From b7336fdf9209693a3c586a3f614c12089dd47a9b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:41:56 -0700 Subject: [PATCH 06/13] dsv4 h100 recipes: route around NVSHMEM IPC failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers got past module import and weight load (471s), then died simultaneously with: /dvs/p4/build/sw/rel/gpgpu/toolkit/r12.9/main_nvshmem/src/host/mem/ mem_heap.cpp:exchange_heap_memory_handle:781: Fatal IPC Failure IPC failure: Sending data over socket failed: No such file or directory Root cause: `all2all-backend: deepep_{high_throughput,low_latency}` routes expert-parallel comms through NVSHMEM. The cu129 DSV4 vLLM wheel's NVSHMEM can't complete host-side IPC bootstrap after the workers enter the executor init phase. DSR1 on the same H100 nodes uses deepep successfully, but through a different container (nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0) with an older NVSHMEM. Fix — mirror PR 1129's GB200 approach: 1. Drop the `all2all-backend` override entirely. The DSV4 vLLM code picks its own default for this model, which routes through NCCL symmetric memory instead of NVSHMEM. 2. 
Add env vars:
     VLLM_USE_NCCL_SYMM_MEM=1  (prefer the NCCL symm mem path)
     NCCL_CUMEM_ENABLE=1       (enables NCCL's cuMem/VMM allocator, which
                                the symmetric-memory path requires)

Skipped NCCL_MNNVL_ENABLE and NCCL_NVLS_ENABLE (GB200-oriented knobs from
PR 1129: MNNVL is multi-node NVLink, which H100 lacks; NVLS is NVLink
SHARP, an intra-node NVSwitch offload that doesn't help the cross-node
IB path at issue here).

Keeps all H100-specific knobs (VLLM_USE_DEEP_GEMM,
VLLM_MOE_DP_CHUNK_SIZE=192, VLLM_SKIP_P2P_CHECK).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 12 ++++++++++--
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml |  8 ++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 710376db3..26bc1bfe7 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -50,8 +50,16 @@ backend:
   type: vllm
   connector: nixl
 
+  # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL
+  # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM
+  # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp
+  # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is
+  # the companion flag. Matches PR 1129 GB200 (where deepep's NVSHMEM has
+  # the same issue).
   prefill_environment:
     VLLM_USE_DEEP_GEMM: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
     VLLM_SKIP_P2P_CHECK: "1"
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -63,6 +71,8 @@ backend:
 
   decode_environment:
     VLLM_USE_DEEP_GEMM: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
     VLLM_MOE_DP_CHUNK_SIZE: "192"
     VLLM_SKIP_P2P_CHECK: "1"
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
@@ -87,7 +97,6 @@ backend:
       block-size: 256
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
-      all2all-backend: "deepep_high_throughput"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-size: 16
@@ -109,7 +118,6 @@ backend:
       block-size: 256
       no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
-      all2all-backend: "deepep_low_latency"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-size: 16
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 5dd7e4d6c..71df9989b 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -41,8 +41,12 @@ backend:
   type: vllm
   connector: nixl
 
+  # See 1k1k recipe for the rationale behind VLLM_USE_NCCL_SYMM_MEM /
+  # NCCL_CUMEM_ENABLE — the DSV4 wheel's NVSHMEM fails IPC bootstrap.
prefill_environment: VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -54,6 +58,8 @@ backend: decode_environment: VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" @@ -76,7 +82,6 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - all2all-backend: "deepep_high_throughput" tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-size: 16 @@ -98,7 +103,6 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - all2all-backend: "deepep_low_latency" tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-size: 16 From 71ac58a85bc3bcd28a566ab129275c975669c7fc Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 15:29:06 -0700 Subject: [PATCH 07/13] dsv4 h100 recipes: lower gpu-memory-utilization 0.95 -> 0.85 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24913192394 got past every prior failure (NVSHMEM/IPC, module import, argparse) but OOMed during compile_or_warm_up_model: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 93.00 MiB is free. PyTorch: 72.99 GiB | CUDA Graphs: 1.28 GiB File ".../vllm/model_executor/layers/sparse_attn_indexer.py", line 122 DSV4's "Lightning Indexer" sparse attention layer allocates transient torch.empty buffers that aren't accounted for in vLLM's KV cache profiling. With gpu-memory-utilization=0.95, vLLM reserves ~75 GiB of each H100's 79 GiB usable, leaving only ~4 GiB for non-PyTorch state (NCCL buffers, NVSHMEM scratch, the indexer's transient allocations). The indexer's 512 MiB allocation tips it over. The H200 single-node DSV4 recipe uses 0.95 and works because each H200 has 141 GiB/GPU — 4 GiB headroom is enough there. PR 1129 uses 0.88 (prefill) / 0.9 (decode) on GB200's 192 GiB. DSR1 H100 disagg uses vLLM's default 0.9 and works because DSR1's MLA doesn't have the indexer overhead. 0.85 reserves ~12 GiB headroom on H100 80GB, well above the indexer's ~6 GiB working set. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 14 ++++++++++++-- .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 8 ++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 26bc1bfe7..48fb5e9c0 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -106,7 +106,12 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer + # transients on top of vLLM's reserved KV/weights/activations. The H200 + # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB + # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and + # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. 
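+      # The arithmetic, per the commit message (not re-measured here):
+      #   79.19 GiB usable x 0.95 -> ~75.2 GiB reserved, ~4.0 GiB free (OOM)
+      #   79.19 GiB usable x 0.85 -> ~67.3 GiB reserved, ~11.9 GiB free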
+ gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -126,7 +131,12 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer + # transients on top of vLLM's reserved KV/weights/activations. The H200 + # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB + # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and + # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 71df9989b..4e36149aa 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -91,7 +91,9 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom + # for DSV4's sparse attention indexer. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -111,7 +113,9 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom + # for DSV4's sparse attention indexer. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' From 5ce459becabbb26bf79a9e1d6eaadbff7d4cf070 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:21:04 -0700 Subject: [PATCH 08/13] dsv4 h100 recipes: disable sa-bench chat-template path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24914869373: server starts successfully (eval-only succeeds in 33m, end-to-end gsm8k completions). The throughput jobs fail before sending a single request: ValueError: Cannot use chat template functions because tokenizer.chat_template is not set File "/srtctl-benchmarks/sa-bench/benchmark_serving.py", line 346, in sample_random_requests chat_template_dummy = tokenizer.apply_chat_template(...) DSV4-Pro's HF tokenizer ships without a chat_template attribute. The server uses tokenizer-mode=deepseek_v4 (set automatically from the model's tokenizer_config.json) to handle templating itself, but sa-bench's prompt-construction path runs a *local* HF apply_chat_template before sending — and that raises with no template to apply. Eval works because lm-eval-harness sends raw messages to /v1/chat/completions; the server templates them via Dynamo's parser. Set `use_chat_template: false` on both recipes' benchmark blocks (matches PR 1129). sa-bench will send raw random text, which is what the throughput benchmark wants anyway. 
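
For the record, a minimal local repro of the failing path (a sketch, not
sa-bench's actual code; it assumes only what the traceback shows — the
staged tokenizer loads and carries no chat_template):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "/mnt/nfs/lustre/models/dsv4-fp8", trust_remote_code=True)
    print(tok.chat_template)  # -> None for DSV4-Pro
    # sa-bench's sample_random_requests does the equivalent of:
    tok.apply_chat_template([{"role": "user", "content": "dummy"}],
                            tokenize=False)
    # -> ValueError: Cannot use chat template functions because
    #    tokenizer.chat_template is not set
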
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 5 +++++
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 48fb5e9c0..90f5938a7 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -147,3 +147,8 @@ benchmark:
   osl: 1024
   concurrencies: "4x8x16x32x64x128"
   req_rate: "inf"
+  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
+  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
+  # but sa-bench's local apply_chat_template path raises ValueError.
+  # Send raw prompts; the server handles formatting.
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 4e36149aa..5cb4ff084 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -126,3 +126,6 @@ benchmark:
   osl: 1024
   concurrencies: "4x8x16x32x64"
   req_rate: "inf"
+  # See 1k1k recipe rationale — DSV4-Pro tokenizer has no chat_template;
+  # the server handles formatting via --tokenizer-mode deepseek_v4.
+  use_chat_template: false

From 65d223f30e60d7492f986f3e1b21f3896decae93 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 19:11:25 -0700
Subject: [PATCH 09/13] dsv4 h100: add TEP variant + du -sh model size diagnostic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand the search space with a TEP-style recipe alongside the existing
DEP, following the dsr1-fp8-h100-dynamo-sglang TEP/DEP split pattern.

The h100-multinode pool is exactly 4 nodes and DSV4-Pro weights need
>=2 nodes per side, so we cannot add more workers (1P+1D = 4 nodes is
the only fit). The TEP variant therefore differs from DEP by changing
each worker's *internal* parallelism, not the worker count:

  DEP (existing): tp=1, dp=16, ep=16, dp-attn=true
      16 independent attention paths, sharded experts. Better at high
      concurrency / throughput.

  TEP (new): tp=16, dp=1, ep=16, dp-attn=false
      Single replica spread across all 16 GPUs, sharded experts. All 16
      GPUs cooperate on each forward pass. Cross-node TP routes attn
      all-reduce + MoE all2all over IB — expensive per token, but
      latency wins at small batch sizes (conc 4-32).

Concurrency split per the user's hint ("DEP for high conc, TEP for low
conc"):

  1k1k TEP: [4, 8, 16, 32]    1k1k DEP: [64, 128, 256]
  8k1k TEP: [4, 8, 16]        8k1k DEP: [32, 64, 128]

Also extends the DEP high-conc tail by one point each side
(1k1k 128 -> 256, 8k1k 64 -> 128).

The TEP recipe carries no DP flags (data-parallel-size is 1, so there
is no data-parallel-hybrid-lb or DP RPC port to set) and lowers
max-num-seqs from 512 to 64; max-num-batched-tokens stays at 512. The
smaller max-num-seqs keeps cudagraph capture from reserving memory for
batch shapes never reached at conc<=32.

Keeps the existing DSV4 startup workarounds (VLLM_USE_NCCL_SYMM_MEM,
gpu-memory-utilization=0.85, no all2all-backend override, etc).
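
A toy model of the low-conc intuition (illustrative only — it counts
busy attention GPUs and ignores MoE all2all, KV transfer, and batching
effects entirely):

    def busy_attn_gpus(conc: int, dp: int, tp: int) -> int:
        # Each in-flight request runs on exactly one DP replica of tp
        # GPUs; replicas without a request in hand sit idle.
        return min(conc, dp) * tp

    for conc in (4, 16, 64):
        print(conc,
              busy_attn_gpus(conc, dp=16, tp=1),   # DEP
              busy_attn_gpus(conc, dp=1, tp=16))   # TEP

At conc=4, DEP keeps 4 of 16 GPUs on attention while TEP applies all
16 to every request — the latency argument above, modulo cross-node TP
overhead.
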
Doubles the matrix from 2 to 4 entries (validated via MultiNodeMatrixEntry). Also adds `du -sh "$MODEL_PATH"` in the dynamo-vllm branch of launch_h100-dgxc-slurm.sh so model size shows in CI output — useful for catching partial downloads or wrong revisions before the 8-min weight-load step. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 46 ++++++- .../disagg-h100-fp8-1p1d-tep16-tep16.yaml | 129 ++++++++++++++++++ .../disagg-h100-fp8-1p1d-tep16-tep16.yaml | 111 +++++++++++++++ runners/launch_h100-dgxc-slurm.sh | 3 + 4 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1a5b1de92..f737a6946 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2462,11 +2462,37 @@ dsv4-fp8-h100-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true + # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode + # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more + # workers). They differ in how each worker's 16 GPUs are organised: + # - TEP (tp:16,ep:16,dp-attn:false): one replica spread across all 16 + # GPUs, experts sharded. Better latency at low conc — every request + # gets the full compute. Cross-node TP is expensive per token but + # wins when batch is small. + # - DEP (tp:1,ep:16,dp-attn:true): 16 independent attention paths, + # experts sharded. Higher batching capacity, better at high conc. + # Pattern follows dsr1-fp8-h100-dynamo-sglang's TEP/DEP split. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 64, 128] + # TEP — low conc (latency-bound) + - conc-list: [4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DEP — high conc (throughput-bound) + - conc-list: [64, 128, 256] prefill: num-worker: 1 tp: 1 @@ -2483,7 +2509,23 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 64] + # TEP — low conc (latency-bound) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DEP — high conc (throughput-bound) + - conc-list: [32, 64, 128] prefill: num-worker: 1 tp: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..a06bf738b --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,129 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) +# +# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same 2P+2D footprint +# (4 nodes total) but each worker uses tensor parallel across all 16 GPUs +# instead of data parallel. +# +# - DEP variant (high conc): tp=1, dp=16, ep=16. 16 independent attention +# paths across the 16 GPUs, sharded experts. Maximises batching capacity. +# - TEP variant (this file, low conc): tp=16, dp=1, ep=16. One replica +# spread across all 16 GPUs, with experts also sharded across the same +# ranks. All 16 GPUs cooperate on a single forward pass — gives lower +# latency at small batch sizes since each request gets all the compute. +# +# Cross-node TP=16 routes attention all-reduce + MoE all2all across the IB +# fabric. That's expensive per-token but only meaningful at high +# concurrency; at conc 4-32 the latency win from more compute-per-request +# beats the IB overhead. +# +# Same DSV4 startup workarounds as the DEP recipe (alec-flowers fork +# pinning ai-dynamo to 6a159fe, NCCL_SYMM_MEM, no all2all-backend, drop +# API-server flags, gpu-memory-utilization 0.85 for indexer headroom). 
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + # TEP serves smaller batches; cap max-num-seqs so cudagraph capture + # doesn't waste memory on shapes we won't hit at conc <= 32. + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..9e82c02f3 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,111 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) — 8k/1k +# +# See 1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml for the full DEP-vs-TEP +# rationale and the DSV4 startup workarounds. This file just changes the +# benchmark block to ISL=8192 and trims the conc list. 
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" + use_chat_template: false diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 3a279b6d3..26eabad89 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -47,6 +47,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" exit 1 fi + # Verify the weights are staged and log their size (catches partial + # downloads / wrong revisions before we burn 8 min on weight load). + du -sh "$MODEL_PATH" 2>/dev/null || echo "WARNING: could not stat $MODEL_PATH" else echo "Unsupported framework: $FRAMEWORK. 
Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 From 1bdeb9ef09b5b466c4ac601b210ba12b89fca7e1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:35:26 -0700 Subject: [PATCH 10/13] dsv4 h100 recipes: replace broken TEP with low-conc DEP variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweep 24921015519 surfaced that cross-node TP=16 doesn't work with the Dynamo+vLLM stack: pydantic_core._pydantic_core.ValidationError: 1 validation error for ParallelConfig Value error, World size (16) is larger than the number of available GPUs (1) in this node. If this is intentional and you are using: - ray, set '--distributed-executor-backend ray'. - multiprocessing, set '--nnodes' appropriately. Dynamo spawns one vLLM process per GPU; each process only sees its single local GPU and vLLM rejects world_size=16. Working around this would need --distributed-executor-backend=ray which Dynamo doesn't coordinate. None of the working DSV4 vLLM recipes (kimi GB200, DSR1 H100, PR 1129 GB200) use cross-node TP either — the execution model assumes one process per GPU. So drop TEP entirely; instead deliver two DEP recipes per ISL/OSL that differ in batch tuning: DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, enforce-eager=true on decode (no cudagraph). Smaller cudagraph capture footprint, faster warmup, no decode kernel-launch optimization (irrelevant at conc<=32 where network round-trips dominate per-token latency). DEP (high conc, existing): max-num-seqs=512, max-num-batched-tokens =512, decode cudagraph enabled. Higher batching throughput at conc>=64. Conc splits unchanged from previous attempt: 1k1k eager [4,8,16,32] 1k1k dep [64,128,256] 8k1k eager [4,8,16] 8k1k dep [32,64,128] Same 4 matrix entries, all with the same tp=1/dp=16/ep=16/dp-attn=true metadata; differentiation is via the CONFIG_FILE pointer in additional-settings (mirrors how the trtllm dsr1-h100 recipes encode multiple variants of the same topology). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 51 ++++++++-------- ...sagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} | 58 ++++++++++--------- ...sagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} | 26 +++++---- 3 files changed, 72 insertions(+), 63 deletions(-) rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/{disagg-h100-fp8-1p1d-tep16-tep16.yaml => disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} (62%) rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/{disagg-h100-fp8-1p1d-tep16-tep16.yaml => disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} (82%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f737a6946..c04c75d79 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2464,34 +2464,37 @@ dsv4-fp8-h100-dynamo-vllm: disagg: true # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more - # workers). They differ in how each worker's 16 GPUs are organised: - # - TEP (tp:16,ep:16,dp-attn:false): one replica spread across all 16 - # GPUs, experts sharded. Better latency at low conc — every request - # gets the full compute. Cross-node TP is expensive per token but - # wins when batch is small. - # - DEP (tp:1,ep:16,dp-attn:true): 16 independent attention paths, - # experts sharded. Higher batching capacity, better at high conc. 
- # Pattern follows dsr1-fp8-h100-dynamo-sglang's TEP/DEP split. + # workers). Cross-node TP=16 (TEP) was attempted but the Dynamo+vLLM + # stack only supports one process per GPU, and vLLM's argparse rejects + # world_size=16 with only 1 local GPU. Both recipes therefore share the + # DEP topology (tp=1, dp=16, ep=16, dp-attn=true) and differ in batch + # tuning: + # - DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, + # enforce-eager=true on decode (no cudagraph). Smaller memory and + # faster warmup; trades decode kernel-launch overhead that doesn't + # matter at conc<=32. + # - DEP (high conc): max-num-seqs=512, max-num-batched-tokens=512, + # decode cudagraph enabled. Higher batch throughput. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # TEP — low conc (latency-bound) + # DEP-eager — low conc (smaller batch, no decode cudagraph) - conc-list: [4, 8, 16, 32] prefill: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false + dp-attn: true additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false - # DEP — high conc (throughput-bound) + dp-attn: true + # DEP — high conc (throughput-bound, decode cudagraph) - conc-list: [64, 128, 256] prefill: num-worker: 1 @@ -2509,22 +2512,22 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # TEP — low conc (latency-bound) + # DEP-eager — low conc (smaller batch, no decode cudagraph) - conc-list: [4, 8, 16] prefill: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false + dp-attn: true additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false - # DEP — high conc (throughput-bound) + dp-attn: true + # DEP — high conc (throughput-bound, decode cudagraph) - conc-list: [32, 64, 128] prefill: num-worker: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml similarity index 62% rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml index a06bf738b..e7256c46a 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml @@ -1,26 +1,29 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) # -# Sister recipe to 
disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same 2P+2D footprint -# (4 nodes total) but each worker uses tensor parallel across all 16 GPUs -# instead of data parallel. +# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same topology +# (tp=1, dp=16, ep=16, dp-attn=true) and same 4-node footprint, but +# tuned for low concurrency (4-32): # -# - DEP variant (high conc): tp=1, dp=16, ep=16. 16 independent attention -# paths across the 16 GPUs, sharded experts. Maximises batching capacity. -# - TEP variant (this file, low conc): tp=16, dp=1, ep=16. One replica -# spread across all 16 GPUs, with experts also sharded across the same -# ranks. All 16 GPUs cooperate on a single forward pass — gives lower -# latency at small batch sizes since each request gets all the compute. +# - max-num-seqs: 64 (vs 512). Engine never queues more than ~conc +# sequences at conc<=32, so the larger budget just wastes cudagraph +# capture memory. +# - max-num-batched-tokens: 256 (vs 512). Smaller prefill chunks = +# lower TTFT for sparse traffic. Throughput penalty is irrelevant +# at low conc. +# - enforce-eager: true on decode (vs cudagraph). Cudagraph capture +# reserves ~1-2 GiB per worker and adds ~30s warmup. At low conc +# the per-token kernel-launch overhead is dominated by network +# round-trips anyway, so eager mode is a fine tradeoff. # -# Cross-node TP=16 routes attention all-reduce + MoE all2all across the IB -# fabric. That's expensive per-token but only meaningful at high -# concurrency; at conc 4-32 the latency win from more compute-per-request -# beats the IB overhead. +# Originally targeted as a TEP variant (tp=16) but Dynamo's vLLM workers +# spawn one process per GPU and vLLM rejects world_size=16 with only 1 +# local GPU per process. Cross-node TP would need +# --distributed-executor-backend=ray which Dynamo doesn't coordinate. +# So we keep DEP topology and differentiate by batch tuning instead. # -# Same DSV4 startup workarounds as the DEP recipe (alec-flowers fork -# pinning ai-dynamo to 6a159fe, NCCL_SYMM_MEM, no all2all-backend, drop -# API-server flags, gpu-memory-utilization 0.85 for indexer headroom). +# All other DSV4 startup workarounds match the high-conc DEP recipe. -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager" model: path: "dsv4-fp8" @@ -68,6 +71,7 @@ backend: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -86,16 +90,15 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 - # TEP serves smaller batches; cap max-num-seqs so cudagraph capture - # doesn't waste memory on shapes we won't hit at conc <= 32. 
max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -108,17 +111,18 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml similarity index 82% rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml index 9e82c02f3..e0b2af853 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml @@ -1,10 +1,9 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) — 8k/1k +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) — 8k/1k # -# See 1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml for the full DEP-vs-TEP -# rationale and the DSV4 startup workarounds. This file just changes the -# benchmark block to ISL=8192 and trims the conc list. +# See 1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml for full rationale. +# Same low-conc-tuned DEP variant; only the benchmark block differs. 
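+# (Here: isl 8192, osl 1024, concurrencies "4x8x16" vs "4x8x16x32" in the
+# 1k1k eager recipe; the name gains an -8k1k suffix.)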
-name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager-8k1k" model: path: "dsv4-fp8" @@ -52,6 +51,7 @@ backend: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -70,14 +70,15 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -90,17 +91,18 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" From 17dcc847a5da533fce8027fe4e6a15a992138d15 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:38:15 -0700 Subject: [PATCH 11/13] dsv4 h100: revert to single high-conc DEP config (working from run 24914869373) The eager low-conc DEP variant added in 1bdeb9ef was untested, and the TEP variant before that didn't work at all on Dynamo+vLLM. Drop both and revert to the single-DEP search-space form that successfully served gsm8k eval-only in run 24914869373: 1k1k DEP: conc [4, 8, 16, 32, 64, 128] 8k1k DEP: conc [4, 8, 16, 32, 64] Each entry uses tp=1, dp=16, ep=16, dp-attn=true (1P+1D filling the 4-node h100-multinode pool). max-num-seqs=512, decode cudagraph on, gpu-memory-utilization=0.85. Removes: - benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 53 +------ ...isagg-h100-fp8-1p1d-dep16-dep16-eager.yaml | 133 ------------------ ...isagg-h100-fp8-1p1d-dep16-dep16-eager.yaml | 113 --------------- 3 files changed, 6 insertions(+), 293 deletions(-) delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c04c75d79..f3a0158d4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2462,40 +2462,15 @@ dsv4-fp8-h100-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode - # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more - # workers). 
Cross-node TP=16 (TEP) was attempted but the Dynamo+vLLM - # stack only supports one process per GPU, and vLLM's argparse rejects - # world_size=16 with only 1 local GPU. Both recipes therefore share the - # DEP topology (tp=1, dp=16, ep=16, dp-attn=true) and differ in batch - # tuning: - # - DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, - # enforce-eager=true on decode (no cudagraph). Smaller memory and - # faster warmup; trades decode kernel-launch overhead that doesn't - # matter at conc<=32. - # - DEP (high conc): max-num-seqs=512, max-num-batched-tokens=512, - # decode cudagraph enabled. Higher batch throughput. + # 1P+1D DEP over the full 4-node h100-multinode pool (DSV4-Pro weights + # need >=2 nodes per side, so we cannot fit more workers). High-conc + # only — this is the configuration that successfully served eval-only + # gsm8k end-to-end in run 24914869373. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # DEP-eager — low conc (smaller batch, no decode cudagraph) - - conc-list: [4, 8, 16, 32] - prefill: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - # DEP — high conc (throughput-bound, decode cudagraph) - - conc-list: [64, 128, 256] + - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 1 @@ -2512,23 +2487,7 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # DEP-eager — low conc (smaller batch, no decode cudagraph) - - conc-list: [4, 8, 16] - prefill: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - # DEP — high conc (throughput-bound, decode cudagraph) - - conc-list: [32, 64, 128] + - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml deleted file mode 100644 index e7256c46a..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) -# -# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same topology -# (tp=1, dp=16, ep=16, dp-attn=true) and same 4-node footprint, but -# tuned for low concurrency (4-32): -# -# - max-num-seqs: 64 (vs 512). Engine never queues more than ~conc -# sequences at conc<=32, so the larger budget just wastes cudagraph -# capture memory. -# - max-num-batched-tokens: 256 (vs 512). Smaller prefill chunks = -# lower TTFT for sparse traffic. Throughput penalty is irrelevant -# at low conc. -# - enforce-eager: true on decode (vs cudagraph). Cudagraph capture -# reserves ~1-2 GiB per worker and adds ~30s warmup. At low conc -# the per-token kernel-launch overhead is dominated by network -# round-trips anyway, so eager mode is a fine tradeoff. 
-# -# Originally targeted as a TEP variant (tp=16) but Dynamo's vLLM workers -# spawn one process per GPU and vLLM rejects world_size=16 with only 1 -# local GPU per process. Cross-node TP would need -# --distributed-executor-backend=ray which Dynamo doesn't coordinate. -# So we keep DEP topology and differentiate by batch tuning instead. -# -# All other DSV4 startup workarounds match the high-conc DEP recipe. - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml deleted file mode 100644 index e0b2af853..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# DeepSeek-V4-Pro 
Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) — 8k/1k -# -# See 1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml for full rationale. -# Same low-conc-tuned DEP variant; only the benchmark block differs. - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager-8k1k" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - req_rate: "inf" - use_chat_template: false From 3cfdb7b468ee7b5b868060a0e200b0fa83032aae Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:23:14 -0700 Subject: [PATCH 12/13] Bump dsv4 H100 health_check timeout to 4h, slurm time_limit to 8h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24922713022 hit the default 1800s orchestrator deadline on all three matrix jobs (1k1k bench, 8k1k bench, 8k1k eval). Concurrent multinode matrix jobs starve the same Lustre OSTs — first shard load took 423s, shard 8/64 was reached at 16 min, projected total weight load ~107 min. 
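Back-of-envelope on the deadlines (illustrative sketch, not code in the
repo; simple linear extrapolation from the shard timings above):

    # hypothetical check of the new knobs against the observed load rate
    default_deadline = 180 * 10            # srt-slurm default: 1800 s
    bumped_deadline = 1440 * 10            # this change: 14400 s = 4 h
    est_weight_load = (16 * 60 / 8) * 64   # 2 min/shard at shard 8/64 -> ~128 min
    assert default_deadline < est_weight_load < bumped_deadline

(The run's own projection was ~107 min; either estimate lands far past the
1800 s default and comfortably inside 4 h.)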
Match the GB200 dsv4 recipes which already added these blocks for the
same reason.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 17 +++++++++++++++++
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 10 ++++++++++
 2 files changed, 27 insertions(+)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 90f5938a7..5a5164072 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -32,6 +32,23 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Bumped from the srt-slurm 6h default. Three multinode matrix jobs (1k1k
+# bench, 8k1k bench, 8k1k eval) launch concurrently from the run-sweep
+# workflow and all read the same /mnt/nfs/lustre/models/dsv4-fp8 — combined
+# with cudagraph capture this can exceed the default wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24922713022
+# saw shard 1/64 take 423s and shard 8/64 reach only 16 min into the load —
+# the 32 H100 workers per job × 3 concurrent matrix jobs starve the same
+# Lustre OSTs. Default 1800s deadline fired before any job became healthy.
+# Match the GB200 dsv4 recipes; over-long deadline just idles, doesn't burn
+# compute.
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
 resources:
   gpu_type: "h100"
   gpus_per_node: 8
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 5cb4ff084..ef9a8224e 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -23,6 +23,16 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# See 1k1k recipe for slurm.time_limit and health_check rationale — three
+# concurrent matrix jobs starve the same Lustre OSTs and the default 1800s
+# orchestrator deadline fires before any job becomes healthy.
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
 resources:
   gpu_type: "h100"
   gpus_per_node: 8

From f798361a93a6b4e46260c4ffd693de58206b2ca5 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 23:34:39 -0700
Subject: [PATCH 13/13] Switch dsv4 H100 disagg from DP=16 to cross-node TP=16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DSV4-Pro per-rank weights are 74.99 GiB at DP=16/EP=16 — H100 80GB leaves
only ~4 GiB headroom and sparse_attn_indexer's profile_run
torch.empty(512 MiB) OOMs (run 24923521075). Cross-node TP=16 shards the
model 16-way across 2 nodes (~50 GiB per rank).

srt-slurm's vllm.py:386-388 emits --headless on the secondary node when
data-parallel-size is absent and the worker spans nodes; Dynamo's
run_dynamo_headless calls vLLM's run_headless which uses MultiprocExecutor
+ torch.distributed (no Ray) to form the cross-node PG. NCCL TP all-reduce
flows over IB on every layer — slower per-token than intra-node NVLink,
but the only way to fit DSV4-Pro at 80 GB.
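Illustrative sketch of the per-node argv split this produces (hypothetical
helper, not srt-slurm's actual vllm.py code; flag names as cited above):

    # leader (node_rank=0) runs full Dynamo + vLLM; secondary gets --headless
    def node_argv(node_rank: int, master_addr: str) -> list[str]:
        argv = [
            "--tensor-parallel-size", "16",
            "--nnodes", "2",
            "--node-rank", str(node_rank),
            "--master-addr", master_addr,
        ]
        if node_rank > 0:
            argv.append("--headless")  # joins the leader's PG, no frontend
        return argv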
Other changes: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce
allocator fragmentation; gpu-memory-utilization back to 0.95 (matches
H200); enforce-eager on decode for the first attempt (cross-node
cudagraphs are fragile).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/nvidia-master.yaml            |  37 ++--
 .../disagg-h100-fp8-1p1d-dep16-dep16.yaml     | 171 ----------------
 .../disagg-h100-fp8-1p1d-tep16-tep16.yaml     | 183 ++++++++++++++++++
 ... => disagg-h100-fp8-1p1d-tep16-tep16.yaml} |  47 ++---
 4 files changed, 215 insertions(+), 223 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
 rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/{disagg-h100-fp8-1p1d-dep16-dep16.yaml => disagg-h100-fp8-1p1d-tep16-tep16.yaml} (57%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0c8457169..1182a0070 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2450,8 +2450,10 @@ dsv4-fp8-h200-vllm:
 
 # DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm.
 # 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool).
-# Minimum viable disagg shape: DSV4-Pro FP8 weights (~862 GB) don't fit on one
-# H100 node (8x80GB=640GB), so each side must own >=2 nodes. Recipes bundled
+# Cross-node TP=16 over IB: DSV4-Pro per-rank weight footprint at DP=16/EP=16
+# is 74.99 GiB on H100 80GB (run 24923521075 OOM'd in sparse_attn_indexer
+# profile_run with only ~4 GiB headroom). TP=16 shards the model 16-way
+# across the 2 nodes, dropping per-rank weights to ~50 GiB. Recipe bundled
 # locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed.
 dsv4-fp8-h100-dynamo-vllm:
   image: vllm/vllm-openai:deepseekv4-cu129
@@ -2462,10 +2464,9 @@ dsv4-fp8-h100-dynamo-vllm:
   framework: dynamo-vllm
   multinode: true
   disagg: true
-  # 1P+1D DEP over the full 4-node h100-multinode pool (DSV4-Pro weights
-  # need >=2 nodes per side, so we cannot fit more workers). High-conc
-  # only — this is the configuration that successfully served eval-only
-  # gsm8k end-to-end in run 24914869373.
+  # 1P+1D TEP=16 across the full 4-node h100-multinode pool.
+  # Each prefill/decode worker spans 2 nodes via Dynamo's --headless
+  # secondary-node mode + vLLM's MultiprocExecutor + torch.distributed PG.
seq-len-configs: - isl: 1024 osl: 1024 @@ -2473,34 +2474,34 @@ dsv4-fp8-h100-dynamo-vllm: - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false - isl: 8192 osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml deleted file mode 100644 index 5a5164072..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ /dev/null @@ -1,171 +0,0 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) -# -# Forked from NVIDIA/srt-slurm recipes/vllm/deepseek-r1/disagg-h100-16gpu.yaml. -# Engine flags updated to match the single-node H200 DSV4 recipe (deepseek_v4 -# tokenizer/parsers, FP8 KV cache, block_size=256, prefix caching disabled, -# compilation mode 0). Kept from DSR1: NixlConnector P<->D KV transfer, -# VLLM_MOE_DP_CHUNK_SIZE=192 for H100 80GB (vs H200 141GB default of 384), -# deepep all2all backends, VLLM_USE_DEEP_GEMM. -# -# max-model-len is 16384, not H200's 800000 — KV for 800k context does not -# fit across two 80GB decode nodes. -# -# DP+EP configuration: -# - Each GPU runs its own vLLM process (tensor-parallel-size: 1) -# - 1 prefill endpoint x 16 GPUs (2 nodes, DP16) -> 16 prefill processes -# - 1 decode endpoint x 16 GPUs (2 nodes, DP16) -> 16 decode processes -# - Total: 32 GPUs across 4 nodes (fills the h100-multinode pool) - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM -# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), -# which extends the dynamo config to accept `hash` as well as `version`. -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -# Bumped from the srt-slurm 6h default. 
Three multinode matrix jobs (1k1k -# bench, 8k1k bench, 8k1k eval) launch concurrently from the run-sweep -# workflow and all read the same /mnt/nfs/lustre/models/dsv4-fp8 — combined -# with cudagraph capture this can exceed the default wall clock. -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24922713022 -# saw shard 1/64 take 423s and shard 8/64 reach only 16 min into the load — -# the 32 H100 workers per job × 3 concurrent matrix jobs starve the same -# Lustre OSTs. Default 1800s deadline fired before any job became healthy. -# Match the GB200 dsv4 recipes; over-long deadline just idles, doesn't burn -# compute. -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL - # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM - # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp - # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is - # the companion flag. Matches PR 1129 GB200 (where deepep's NVSHMEM has - # the same issue). - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - # Tokenizer mode, tool-call parser, reasoning parser, and - # enable-auto-tool-choice are OpenAI API-server flags; Dynamo is the - # frontend in this disagg setup and handles tool/reasoning parsing - # itself. The vLLM workers are engine-only processes and their argparse - # rejects those flags (matches kimi-k2.5 recipe which omits them too). - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 512 - max-num-batched-tokens: 512 - # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer - # transients on top of vLLM's reserved KV/weights/activations. The H200 - # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB - # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and - # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. 
-      gpu-memory-utilization: 0.85
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      kv-cache-dtype: "fp8"
-      block-size: 256
-      no-enable-prefix-caching: true
-      no-enable-flashinfer-autotune: true
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 16384
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer
-      # transients on top of vLLM's reserved KV/weights/activations. The H200
-      # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB
-      # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and
-      # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB.
-      gpu-memory-utilization: 0.85
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}'
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64x128"
-  req_rate: "inf"
-  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
-  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
-  # but sa-bench's local apply_chat_template path raises ValueError.
-  # Send raw prompts; the server handles formatting.
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
new file mode 100644
index 000000000..049fa8218
--- /dev/null
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -0,0 +1,183 @@
+# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16)
+#
+# DSV4-Pro FP8 weights are 74.99 GiB per DP rank — DP=16/EP=16 (one process
+# per GPU) leaves only ~4 GiB headroom on H100 80GB and sparse_attn_indexer
+# OOMs during profile_run (run 24923521075 hit this on every prefill and
+# decode worker simultaneously). Switching to cross-node TP=16 shards
+# dense+expert weights 16-way → ~50 GiB/GPU, plenty of headroom.
+#
+# Layout:
+# - 1 prefill endpoint × TP=16 across 2 nodes (16 GPUs)
+# - 1 decode endpoint × TP=16 across 2 nodes (16 GPUs)
+# - Total: 32 GPUs across 4 nodes (fills h100-multinode pool)
+#
+# How the cross-node launch works:
+# - srt-slurm's standard TP mode (no data-parallel-size) starts one srun
+#   per node, each process gets all 8 local GPUs.
+# - Leader (node_rank=0): full Dynamo + vLLM with `--master-addr
+#   --nnodes 2 --node-rank 0 --tensor-parallel-size 16`. MultiprocExecutor
+#   spawns 8 local workers, then waits for the 8 remote workers.
+# - Secondary (node_rank=1): adds `--headless` (srt-slurm vllm.py:386-388),
+#   which routes through dynamo's run_dynamo_headless → vLLM's run_headless →
+#   MultiprocExecutor(monitor_workers=False) joining the leader's PG over
+#   torch.distributed (master_addr / master_port). NCCL backs the TP
+#   all-reduce; on h100-multinode that flows over IB (no NVLink between
+#   nodes), so per-layer TP comms is the dominant latency cost — accepted
+#   as the only way to fit DSV4-Pro on 80 GB H100.
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel AND that exposes --headless via run_dynamo_headless (see +# components/src/dynamo/vllm/main.py:80). Requires the alec-flowers/srt-slurm +# fork (NVIDIA/srt-slurm#71), which extends the dynamo config to accept +# `hash` as well as `version`. +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +# Bumped from the srt-slurm 6h default. Cross-node TP=16 over IB plus +# cudagraph capture extends post-load init noticeably; cold-cache Lustre +# weight load alone took 24 min in run 24923521075. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24923521075 +# observed full weight load taking ~1450s with three concurrent matrix +# jobs starving the same Lustre OSTs. Cross-node TP setup + indexer +# warm-up adds more on top. Match the GB200 dsv4 recipes; the cost of an +# over-long deadline is sitting idle, not wasted compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL + # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM + # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp + # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is + # the companion flag. + # + # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True reduces fragmentation + # in the tight headroom — even with TP=16 sharding the model 16-way, a + # contiguous 512 MiB indexer scratch can still fail under fragmented + # 80 GB caches. The OOM error message itself recommends this exact flag. + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + # tensor-parallel-size: 16 across 2 nodes (each srt-slurm srun process + # gets 8 local GPUs; vLLM forms a 16-rank PG via the master_addr + # handshake). data-parallel-size is intentionally absent — its presence + # would push srt-slurm into the per-GPU process layout, which is what + # broke the previous TEP=16 attempt with "World size (16) > available + # GPUs (1) in this node". + # + # enable-expert-parallel keeps experts sharded EP=16 along the same + # 16 ranks (each rank holds 1/16 of the routed experts). Communication + # is dominated by per-layer TP all-reduce + EP all-to-all, both over + # IB. 
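+    #
+    # Illustrative contrast of the two process layouts (a sketch, not the
+    # exact argv srt-slurm builds):
+    #   per-GPU DP (previous attempt, OOM'd): 16 processes x 1 GPU,
+    #     each with data-parallel-size 16; dense weights replicated per rank
+    #   cross-node TP (this file): 2 processes x 8 GPUs,
+    #     tensor-parallel-size 16, --nnodes 2, --node-rank {0,1},
+    #     --headless on rank 1; dense+expert weights sharded 16-way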
+    #
+    # enforce-eager on both sides for the first cross-node attempt: cudagraph
+    # capture across nodes is fragile (FULL_DECODE_ONLY graphs include the
+    # cross-node TP all-reduce) and the previous run's 1.48 GiB private-pool
+    # accumulation already burned the headroom. Drop graphs to ship; revisit
+    # for decode performance once the server is observed healthy.
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8"
+      block-size: 256
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 16
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      # With TP=16 the per-rank model footprint drops from ~75 GiB to
+      # ~50 GiB, so we can match the H200 single-node 0.95 utilization.
+      gpu-memory-utilization: 0.95
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8"
+      block-size: 256
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 16
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      gpu-memory-utilization: 0.95
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64x128"
+  req_rate: "inf"
+  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
+  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
+  # but sa-bench's local apply_chat_template path raises ValueError.
+  # Send raw prompts; the server handles formatting.
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
similarity index 57%
rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
index ef9a8224e..e31e8d531 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -1,31 +1,26 @@
-# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) — 8k/1k
+# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) — 8k/1k
 #
 # Same engine flags as the 1k1k variant. Only the benchmark block differs
 # (ISL=8192, tighter concurrency sweep due to larger prefill work).
 #
-# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
-# for the full rationale (FP8 KV cache, block_size=256, deepseek_v4 parsers,
-# NixlConnector, H100-tuned VLLM_MOE_DP_CHUNK_SIZE=192).
+# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +# for the full rationale (cross-node TP=16 layout, MultiprocExecutor + +# torch.distributed PG handshake, --headless secondary node, IB-backed +# NCCL TP all-reduce, deepseek_v4 parsers, NixlConnector). -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-8k1k" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" model: path: "dsv4-fp8" container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" -# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM -# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), -# which extends the dynamo config to accept `hash` as well as `version`. dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh -# See 1k1k recipe for slurm.time_limit and health_check rationale — three -# concurrent matrix jobs starve the same Lustre OSTs and the default 1800s -# orchestrator deadline fires before any job becomes healthy. slurm: time_limit: "8:00:00" @@ -51,39 +46,33 @@ backend: type: vllm connector: nixl - # See 1k1k recipe for the rationale behind VLLM_USE_NCCL_SYMM_MEM / - # NCCL_CUMEM_ENABLE — the DSV4 wheel's NVSHMEM fails IPC bootstrap. prefill_environment: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_SERVER_DEV_MODE: "1" TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_SERVER_DEV_MODE: "1" TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" vllm_config: - # API-server flags (tokenizer-mode, tool-call-parser, reasoning-parser, - # enable-auto-tool-choice) are handled by Dynamo frontend, not the - # vLLM engine workers. See 1k1k recipe for rationale. prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -92,18 +81,14 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 + tensor-parallel-size: 16 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom - # for DSV4's sparse attention indexer. - gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.95 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -115,20 +100,16 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 + tensor-parallel-size: 16 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom - # for DSV4's sparse attention indexer. 
- gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.95 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" @@ -136,6 +117,4 @@ benchmark: osl: 1024 concurrencies: "4x8x16x32x64" req_rate: "inf" - # See 1k1k recipe rationale — DSV4-Pro tokenizer has no chat_template; - # the server handles formatting via --tokenizer-mode deepseek_v4. use_chat_template: false
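A minimal repro of the chat-template pitfall those benchmark blocks guard
against with use_chat_template: false (a sketch; model id as served above,
and it assumes a transformers version that raises rather than warns when
tokenizer.chat_template is unset):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "deepseek-ai/DeepSeek-V4-Pro", trust_remote_code=True)
    try:
        tok.apply_chat_template(
            [{"role": "user", "content": "hi"}], tokenize=False)
    except ValueError:
        pass  # no chat_template on the tokenizer -> send raw prompts instead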