From 7dae672d3b5271ccdf1bab5f9c5b6f190abcebc8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:07:01 -0700 Subject: [PATCH 01/13] Add H100 config: dsv4-fp8-dynamo-vllm (DeepSeek-V4-Pro multinode disagg) Port the DSV4-Pro vLLM recipe from single-node H200 to H100 as multinode disaggregated serving via Dynamo. The ~862 GB FP8 weights don't fit on one 8xH100-80GB node (640 GB), so each side must own >=2 nodes; with the h100-multinode pool at 4 nodes, 2P+2D DP16/EP16 per side (32 H100s total) is the minimum viable shape and fills the pool exactly. Engine flags match the single-node H200 recipe: deepseek_v4 tokenizer, tool-call, and reasoning parsers; FP8 KV cache; block size 256; prefix caching disabled; compilation mode 0 with FULL_DECODE_ONLY cudagraph. max-model-len is capped at 16384 (H200's 800k does not fit KV across two 80GB decode nodes). Keeps H100-tuned knobs from the DSR1 vLLM recipe: VLLM_MOE_DP_CHUNK_SIZE=192, deepep_{high_throughput,low_latency} all2all backends, NixlConnector P<->D KV transfer, VLLM_USE_DEEP_GEMM, dynamo 1.0.1. srt-slurm recipes are bundled locally at benchmarks/multi_node/srt_slurm_recipes/ and overlaid onto the srt-slurm clone at runtime. This is temporary until the recipes can be upstreamed to NVIDIA/srt-slurm. Changes: - recipes: benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/ {1k1k,8k1k}/disagg-h100-fp8-1p1d-dep16-dep16.yaml - runner: launch_h100-dgxc-slurm.sh gains a dynamo-vllm framework branch (dsv4-fp8 model path at /mnt/nfs/lustre/models/dsv4-fp8, vLLM container squash mapping, srtslurm.yaml dynamo-vllm alias) and an unconditional local-recipes overlay after the srt-slurm checkout - master: .github/configs/nvidia-master.yaml adds dsv4-fp8-h100-dynamo-vllm with 1k1k conc [4,8,16,32,64,128] and 8k1k conc [4,8,16,32,64] Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 50 +++++++ .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 123 ++++++++++++++++++ .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 114 ++++++++++++++++ perf-changelog.yaml | 14 ++ runners/launch_h100-dgxc-slurm.sh | 24 +++- 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 96273444f..bc1a871ff 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2384,6 +2384,56 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm. +# 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool). +# Minimum viable disagg shape: DSV4-Pro FP8 weights (~862 GB) don't fit on one +# H100 node (8x80GB=640GB), so each side must own >=2 nodes. Recipes bundled +# locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed. 
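+# Back-of-envelope fit check (weights only, using the figures above; the
+# per-GPU share under EP16 is a lower bound, since non-expert weights are
+# replicated across the DP ranks):
+#   1 node:  862 GB vs  8 x 80 GB =  640 GB -> does not fit
+#   2 nodes: 862 GB vs 16 x 80 GB = 1280 GB -> ~54 GB/GPU, leaves KV room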
+dsv4-fp8-h100-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h100-multinode + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 16 + dp-attn: true + qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu129-amd64 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml new file mode 100644 index 000000000..1aaaf65e3 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -0,0 +1,123 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) +# +# Forked from NVIDIA/srt-slurm recipes/vllm/deepseek-r1/disagg-h100-16gpu.yaml. +# Engine flags updated to match the single-node H200 DSV4 recipe (deepseek_v4 +# tokenizer/parsers, FP8 KV cache, block_size=256, prefix caching disabled, +# compilation mode 0). Kept from DSR1: NixlConnector P<->D KV transfer, +# VLLM_MOE_DP_CHUNK_SIZE=192 for H100 80GB (vs H200 141GB default of 384), +# deepep all2all backends, VLLM_USE_DEEP_GEMM. +# +# max-model-len is 16384, not H200's 800000 — KV for 800k context does not +# fit across two 80GB decode nodes. 
+# +# DP+EP configuration: +# - Each GPU runs its own vLLM process (tensor-parallel-size: 1) +# - 1 prefill endpoint x 16 GPUs (2 nodes, DP16) -> 16 prefill processes +# - 1 decode endpoint x 16 GPUs (2 nodes, DP16) -> 16 decode processes +# - Total: 32 GPUs across 4 nodes (fills the h100-multinode pool) + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_high_throughput" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_low_latency" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml new file mode 100644 index 000000000..471efcb5b --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -0,0 +1,114 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) — 8k/1k +# +# Same engine flags as the 1k1k variant. Only the benchmark block differs +# (ISL=8192, tighter concurrency sweep due to larger prefill work). 
+# +# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +# for the full rationale (FP8 KV cache, block_size=256, deepseek_v4 parsers, +# NixlConnector, H100-tuned VLLM_MOE_DP_CHUNK_SIZE=192). + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_high_throughput" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + reasoning-parser: "deepseek_v4" + enable-auto-tool-choice: true + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + all2all-backend: "deepep_low_latency" + data-parallel-hybrid-lb: true + tensor-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + async-scheduling: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b2e138c8..d951b92da 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1745,3 +1745,17 @@ - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 + +- config-keys: + - dsv4-fp8-h100-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP8 H100 multinode disagg benchmark via dynamo-vllm" + - "2 prefill nodes + 2 decode nodes (32 H100s total, DP16/EP16 per side)" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Engine flags match H200 single-node recipe (deepseek_v4 tokenizer/parsers, FP8 KV cache, block size 256, prefix caching 
disabled)" + - "max-model-len 16384 (H100 80GB KV headroom; H200's 800k does not fit across 2 decode nodes)" + - "VLLM_MOE_DP_CHUNK_SIZE=192 and deepep_{high_throughput,low_latency} all2all backends tuned for H100 80GB" + - "NixlConnector P<->D KV transfer, dynamo 1.0.1" + - "srt-slurm recipes bundled locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed" + - "Configs: 1k1k conc 4-128, 8k1k conc 4-64" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..33efc3e6c 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -29,8 +29,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-trt: $MODEL_PREFIX/$PRECISION" exit 1 fi + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" + export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + else + echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" + exit 1 + fi else - echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -45,6 +54,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 + # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until + # the upstream PR lands; cp -r merges directories on GNU cp. + LOCAL_RECIPES_DIR="$GITHUB_WORKSPACE/benchmarks/multi_node/srt_slurm_recipes" + if [ -d "$LOCAL_RECIPES_DIR" ]; then + echo "Overlaying local srt-slurm recipes from $LOCAL_RECIPES_DIR" + cp -r "$LOCAL_RECIPES_DIR"/* recipes/ + fi + echo "Installing srtctl..." 
export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" export UV_CACHE_DIR="/mnt/nfs/sa-shared/.uv/cache" @@ -78,6 +95,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + # vLLM container mapping - IMAGE is a Docker Hub reference (no registry prefix swap) + CONTAINER_KEY="$IMAGE" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" fi export ISL="$ISL" @@ -105,6 +126,7 @@ model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}" From 0cd54afb3495a3f5701d796f69f3f693f1d6a8d9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:07:43 -0700 Subject: [PATCH 02/13] Update perf-changelog pr-link to PR 1142 Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d951b92da..af66beced 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1758,4 +1758,4 @@ - "NixlConnector P<->D KV transfer, dynamo 1.0.1" - "srt-slurm recipes bundled locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed" - "Configs: 1k1k conc 4-128, 8k1k conc 4-64" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1142 From 88b80c01f1c403e5ee0e225d69daa420b1ad5ee5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:31:42 -0700 Subject: [PATCH 03/13] launch_h100: pre-create logs dir and tar outputs/ on early failure Sweep 24909864822 had all three multinode jobs fail in 6s with ExitCode=1:0 and no sweep_JOBID.log written, leaving no usable diagnostic in the CI artifact. Two defensive changes: 1. mkdir -p outputs/$JOB_ID/logs before polling, so Slurm's #SBATCH --output=outputs/%j/logs/sweep_%j.log directive can open the target file even when the compute-node stepd lacks permission to create the parent dir on NFS. 2. On the "job failed before creating log file" path, tar outputs/$JOB_ID/ (sbatch_script.sh, config.yaml, any partial log, and the scontrol dump) into multinode_server_logs.tar.gz so the CI artifact captures what was submitted and why Slurm exited early. Previously exit 1 ran before the tar step. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_h100-dgxc-slurm.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 33efc3e6c..c75507e33 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -178,11 +178,21 @@ EOF LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + # Defensive: pre-create the logs subdir so Slurm's #SBATCH --output=... + # /%j/logs/sweep_%j.log can open the target file even on NFS mounts + # where the compute-node Slurm stepd lacks permission to mkdir -p. + mkdir -p "$LOGS_DIR" 2>/dev/null || true + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do if ! 
squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + scontrol show job "$JOB_ID" | tee "outputs/$JOB_ID/scontrol_show_job.txt" 2>/dev/null + # Preserve sbatch_script.sh, config.yaml, metadata, and any partial + # log so the failure can be diagnosed from the CI artifact. + if [ -d "outputs/$JOB_ID" ]; then + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "outputs/$JOB_ID" . + fi exit 1 fi echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." From e0359c67db71b53697ca128ad2669862f18f8458 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:59:08 -0700 Subject: [PATCH 04/13] Fix dsv4 dynamo-vllm: switch to alec-flowers/srt-slurm@PR71 fork PR 1142's first real sweep hit "ModuleNotFoundError: No module named 'vllm.inputs.data'" on all three multinode jobs. Same error as PR 1129 on GB200. Root cause: ai-dynamo 1.0.1 (installed by NVIDIA/srt-slurm@sa-submission-q2-2026 via `dynamo: { version: 1.0.1 }`) imports vllm.inputs.data.TokensPrompt, a path removed in the DSV4 vLLM wheel. Dynamo workers crash during import before any vLLM flag matters. Fix, mirroring PR 1129: - launch_h100-dgxc-slurm.sh: override srt-slurm clone URL/ref via SRT_SLURM_REPO_URL and SRT_SLURM_REF env vars, set to alec-flowers/srt-slurm@d60e3f1c (head of NVIDIA/srt-slurm#71) for dynamo-vllm+dsv4. All other frameworks/models keep NVIDIA upstream. - Recipes: replace `dynamo.version: 1.0.1` with `dynamo.hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b`. The fork's schema accepts `hash:` for pinning a specific ai-dynamo/dynamo commit. That commit has the matching vllm.inputs import path. - Recipes: adopt DSV4-specific flags PR 1129 proved necessary for startup: `enforce-eager: true` (prefill only), `enable-sleep-mode: true`, `no-disable-hybrid-kv-cache-manager: true`, explicit `kv-transfer-config` (NixlConnector kv_both), env vars VLLM_SERVER_DEV_MODE=1 and TILELANG_CLEANUP_TEMP_FILES=1. - Recipes: drop `data-parallel-hybrid-lb` and `async-scheduling` (DSR1 patterns that PR 1129 omitted on DSV4; keep minimal delta from DSV4 H200 single-node). Kept H100-specific knobs: VLLM_MOE_DP_CHUNK_SIZE=192, deepep_{high_throughput, low_latency} all2all backends, VLLM_USE_DEEP_GEMM. Skipped GB200-only flags (NCCL_MNNVL_ENABLE, NCCL_NVLS_ENABLE, VLLM_USE_NCCL_SYMM_MEM). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 23 ++++++++++++++----- .../disagg-h100-fp8-1p1d-dep16-dep16.yaml | 23 ++++++++++++++----- runners/launch_h100-dgxc-slurm.sh | 13 +++++++++-- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 1aaaf65e3..fb982b757 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -23,8 +23,11 @@ model: container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), +# which extends the dynamo config to accept `hash` as well as `version`. 
dynamo: - version: 1.0.1 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -52,6 +55,8 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" @@ -62,12 +67,15 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" vllm_config: prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -79,19 +87,21 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_high_throughput" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -103,8 +113,8 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_low_latency" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true @@ -112,7 +122,8 @@ backend: max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 471efcb5b..8e194e719 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -14,8 +14,11 @@ model: container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), +# which extends the dynamo config to accept `hash` as well as `version`. 
dynamo: - version: 1.0.1 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -43,6 +46,8 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" @@ -53,12 +58,15 @@ backend: VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" vllm_config: prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -70,19 +78,21 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_high_throughput" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" tokenizer-mode: "deepseek_v4" tool-call-parser: "deepseek_v4" @@ -94,8 +104,8 @@ backend: no-enable-prefix-caching: true no-enable-flashinfer-autotune: true all2all-backend: "deepep_low_latency" - data-parallel-hybrid-lb: true tensor-parallel-size: 1 + pipeline-parallel-size: 1 data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true @@ -103,7 +113,8 @@ backend: max-num-seqs: 512 max-num-batched-tokens: 512 gpu-memory-utilization: 0.95 - async-scheduling: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index c75507e33..3a279b6d3 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -34,6 +34,15 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + # NVIDIA/srt-slurm@sa-submission-q2-2026 installs ai-dynamo 1.0.1, + # which imports vllm.inputs.data.TokensPrompt — a path the DSV4 + # vLLM wheel has removed. Switch to alec-flowers' fork (head of + # https://github.com/NVIDIA/srt-slurm/pull/71) which supports + # dynamo.hash pinning so the recipe can pick a dynamo commit + # compatible with the DSV4 vllm.inputs layout. Matches PR #1129 + # on GB200. 
+ export SRT_SLURM_REPO_URL="https://github.com/alec-flowers/srt-slurm.git" + export SRT_SLURM_REF="d60e3f1c7921721e52af01afaab59a70a1631106" else echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" exit 1 @@ -50,9 +59,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone "${SRT_SLURM_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout "${SRT_SLURM_REF:-sa-submission-q2-2026}" # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until # the upstream PR lands; cp -r merges directories on GNU cp. From b92ef5aa5d2e6d761ac9469a9033dc4250c0f758 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:13:50 -0700 Subject: [PATCH 05/13] dsv4 h100 recipes: drop API-server-only flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynamo vLLM worker argparse rejects --enable-auto-tool-choice and --tool-call-parser — the sweep from e0359c67 got past the module-import error but failed with "unrecognized arguments: --enable-auto-tool-choice --tool-call-parser deepseek_v4" during prefill worker startup. These flags (along with --tokenizer-mode and --reasoning-parser) are OpenAI API-server concerns. In disagg, Dynamo is the frontend and does tokenization / tool parsing itself; the vLLM workers are engine-only processes and expose only engine args. The H200 single-node recipe uses `vllm serve` directly (full API server), which is why those flags work there but fail here. Kimi K2.5 (only other working dynamo-vllm recipe) also omits all four flags — that's the precedent. Removed from both prefill and decode: tokenizer-mode: deepseek_v4 tool-call-parser: deepseek_v4 reasoning-parser: deepseek_v4 enable-auto-tool-choice: true Kept trust-remote-code: true (needed for DSV4's custom modeling code). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 13 +++++-------- .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 11 +++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index fb982b757..710376db3 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -74,13 +74,14 @@ backend: PYTHONUNBUFFERED: "1" vllm_config: + # Tokenizer mode, tool-call parser, reasoning parser, and + # enable-auto-tool-choice are OpenAI API-server flags; Dynamo is the + # frontend in this disagg setup and handles tool/reasoning parsing + # itself. The vLLM workers are engine-only processes and their argparse + # rejects those flags (matches kimi-k2.5 recipe which omits them too). 
prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 @@ -103,10 +104,6 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 8e194e719..5dd7e4d6c 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -65,13 +65,12 @@ backend: PYTHONUNBUFFERED: "1" vllm_config: + # API-server flags (tokenizer-mode, tool-call-parser, reasoning-parser, + # enable-auto-tool-choice) are handled by Dynamo frontend, not the + # vLLM engine workers. See 1k1k recipe for rationale. prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 @@ -94,10 +93,6 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - reasoning-parser: "deepseek_v4" - enable-auto-tool-choice: true trust-remote-code: true kv-cache-dtype: "fp8" block-size: 256 From b7336fdf9209693a3c586a3f614c12089dd47a9b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:41:56 -0700 Subject: [PATCH 06/13] dsv4 h100 recipes: route around NVSHMEM IPC failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers got past module import and weight load (471s), then died simultaneously with: /dvs/p4/build/sw/rel/gpgpu/toolkit/r12.9/main_nvshmem/src/host/mem/ mem_heap.cpp:exchange_heap_memory_handle:781: Fatal IPC Failure IPC failure: Sending data over socket failed: No such file or directory Root cause: `all2all-backend: deepep_{high_throughput,low_latency}` routes expert-parallel comms through NVSHMEM. The cu129 DSV4 vLLM wheel's NVSHMEM can't complete host-side IPC bootstrap after the workers enter the executor init phase. DSR1 on the same H100 nodes uses deepep successfully, but through a different container (nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0) with an older NVSHMEM. Fix — mirror PR 1129's GB200 approach: 1. Drop the `all2all-backend` override entirely. The DSV4 vLLM code picks its own default for this model, which routes through NCCL symmetric memory instead of NVSHMEM. 2. 
Add env vars:
     VLLM_USE_NCCL_SYMM_MEM=1  (prefer the NCCL symm mem path)
     NCCL_CUMEM_ENABLE=1       (enables NCCL's cuMem/VMM allocator, which
                                the symmetric-memory path requires)

Skipped NCCL_MNNVL_ENABLE and NCCL_NVLS_ENABLE (GB200-oriented knobs from
PR 1129: MNNVL is multi-node NVLink, which H100 lacks; NVLS is NVLink
SHARP, an intra-node NVSwitch offload that doesn't help the cross-node
IB path at issue here).

Keeps all H100-specific knobs (VLLM_USE_DEEP_GEMM,
VLLM_MOE_DP_CHUNK_SIZE=192, VLLM_SKIP_P2P_CHECK).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 12 ++++++++++--
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml |  8 ++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 710376db3..26bc1bfe7 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -50,8 +50,16 @@ backend:
   type: vllm
   connector: nixl
 
+  # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL
+  # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM
+  # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp
+  # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is
+  # the companion flag. Matches PR 1129 GB200 (where deepep's NVSHMEM has
+  # the same issue).
   prefill_environment:
     VLLM_USE_DEEP_GEMM: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
     VLLM_SKIP_P2P_CHECK: "1"
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
@@ -63,6 +71,8 @@ backend:
 
   decode_environment:
     VLLM_USE_DEEP_GEMM: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
     VLLM_MOE_DP_CHUNK_SIZE: "192"
     VLLM_SKIP_P2P_CHECK: "1"
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
@@ -87,7 +97,6 @@ backend:
       block-size: 256
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
-      all2all-backend: "deepep_high_throughput"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-size: 16
@@ -109,7 +118,6 @@ backend:
       block-size: 256
       no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
-      all2all-backend: "deepep_low_latency"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
       data-parallel-size: 16
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 5dd7e4d6c..71df9989b 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -41,8 +41,12 @@ backend:
   type: vllm
   connector: nixl
 
+  # See 1k1k recipe for the rationale behind VLLM_USE_NCCL_SYMM_MEM /
+  # NCCL_CUMEM_ENABLE — the DSV4 wheel's NVSHMEM fails IPC bootstrap.
prefill_environment: VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -54,6 +58,8 @@ backend: decode_environment: VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" @@ -76,7 +82,6 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - all2all-backend: "deepep_high_throughput" tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-size: 16 @@ -98,7 +103,6 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - all2all-backend: "deepep_low_latency" tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-size: 16 From 71ac58a85bc3bcd28a566ab129275c975669c7fc Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 15:29:06 -0700 Subject: [PATCH 07/13] dsv4 h100 recipes: lower gpu-memory-utilization 0.95 -> 0.85 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24913192394 got past every prior failure (NVSHMEM/IPC, module import, argparse) but OOMed during compile_or_warm_up_model: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 93.00 MiB is free. PyTorch: 72.99 GiB | CUDA Graphs: 1.28 GiB File ".../vllm/model_executor/layers/sparse_attn_indexer.py", line 122 DSV4's "Lightning Indexer" sparse attention layer allocates transient torch.empty buffers that aren't accounted for in vLLM's KV cache profiling. With gpu-memory-utilization=0.95, vLLM reserves ~75 GiB of each H100's 79 GiB usable, leaving only ~4 GiB for non-PyTorch state (NCCL buffers, NVSHMEM scratch, the indexer's transient allocations). The indexer's 512 MiB allocation tips it over. The H200 single-node DSV4 recipe uses 0.95 and works because each H200 has 141 GiB/GPU — 4 GiB headroom is enough there. PR 1129 uses 0.88 (prefill) / 0.9 (decode) on GB200's 192 GiB. DSR1 H100 disagg uses vLLM's default 0.9 and works because DSR1's MLA doesn't have the indexer overhead. 0.85 reserves ~12 GiB headroom on H100 80GB, well above the indexer's ~6 GiB working set. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 14 ++++++++++++-- .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 8 ++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 26bc1bfe7..48fb5e9c0 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -106,7 +106,12 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer + # transients on top of vLLM's reserved KV/weights/activations. The H200 + # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB + # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and + # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. 
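+      # The arithmetic, per the commit message (not re-measured here):
+      #   79.19 GiB usable x 0.95 -> ~75.2 GiB reserved, ~4.0 GiB free (OOM)
+      #   79.19 GiB usable x 0.85 -> ~67.3 GiB reserved, ~11.9 GiB free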
+ gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -126,7 +131,12 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer + # transients on top of vLLM's reserved KV/weights/activations. The H200 + # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB + # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and + # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml index 71df9989b..4e36149aa 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml @@ -91,7 +91,9 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom + # for DSV4's sparse attention indexer. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -111,7 +113,9 @@ backend: max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - gpu-memory-utilization: 0.95 + # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom + # for DSV4's sparse attention indexer. + gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' From 5ce459becabbb26bf79a9e1d6eaadbff7d4cf070 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:21:04 -0700 Subject: [PATCH 08/13] dsv4 h100 recipes: disable sa-bench chat-template path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24914869373: server starts successfully (eval-only succeeds in 33m, end-to-end gsm8k completions). The throughput jobs fail before sending a single request: ValueError: Cannot use chat template functions because tokenizer.chat_template is not set File "/srtctl-benchmarks/sa-bench/benchmark_serving.py", line 346, in sample_random_requests chat_template_dummy = tokenizer.apply_chat_template(...) DSV4-Pro's HF tokenizer ships without a chat_template attribute. The server uses tokenizer-mode=deepseek_v4 (set automatically from the model's tokenizer_config.json) to handle templating itself, but sa-bench's prompt-construction path runs a *local* HF apply_chat_template before sending — and that raises with no template to apply. Eval works because lm-eval-harness sends raw messages to /v1/chat/completions; the server templates them via Dynamo's parser. Set `use_chat_template: false` on both recipes' benchmark blocks (matches PR 1129). sa-bench will send raw random text, which is what the throughput benchmark wants anyway. 
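
For the record, a minimal local repro of the failing path (a sketch, not
sa-bench's actual code; it assumes only what the traceback shows — the
staged tokenizer loads and carries no chat_template):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "/mnt/nfs/lustre/models/dsv4-fp8", trust_remote_code=True)
    print(tok.chat_template)  # -> None for DSV4-Pro
    # sa-bench's sample_random_requests does the equivalent of:
    tok.apply_chat_template([{"role": "user", "content": "dummy"}],
                            tokenize=False)
    # -> ValueError: Cannot use chat template functions because
    #    tokenizer.chat_template is not set
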
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 5 +++++
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 48fb5e9c0..90f5938a7 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -147,3 +147,8 @@ benchmark:
   osl: 1024
   concurrencies: "4x8x16x32x64x128"
   req_rate: "inf"
+  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
+  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
+  # but sa-bench's local apply_chat_template path raises ValueError.
+  # Send raw prompts; the server handles formatting.
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 4e36149aa..5cb4ff084 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -126,3 +126,6 @@ benchmark:
   osl: 1024
   concurrencies: "4x8x16x32x64"
   req_rate: "inf"
+  # See 1k1k recipe rationale — DSV4-Pro tokenizer has no chat_template;
+  # the server handles formatting via --tokenizer-mode deepseek_v4.
+  use_chat_template: false

From 65d223f30e60d7492f986f3e1b21f3896decae93 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 19:11:25 -0700
Subject: [PATCH 09/13] dsv4 h100: add TEP variant + du -sh model size diagnostic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand the search space with a TEP-style recipe alongside the existing
DEP, following the dsr1-fp8-h100-dynamo-sglang TEP/DEP split pattern.

The h100-multinode pool is exactly 4 nodes and DSV4-Pro weights need
>=2 nodes per side, so we cannot add more workers (1P+1D = 4 nodes is
the only fit). The TEP variant therefore differs from DEP by changing
each worker's *internal* parallelism, not the worker count:

  DEP (existing): tp=1, dp=16, ep=16, dp-attn=true
      16 independent attention paths, sharded experts. Better at high
      concurrency / throughput.

  TEP (new): tp=16, dp=1, ep=16, dp-attn=false
      Single replica spread across all 16 GPUs, sharded experts. All 16
      GPUs cooperate on each forward pass. Cross-node TP routes attn
      all-reduce + MoE all2all over IB — expensive per token, but
      latency wins at small batch sizes (conc 4-32).

Concurrency split per the user's hint ("DEP for high conc, TEP for low
conc"):

  1k1k TEP: [4, 8, 16, 32]    1k1k DEP: [64, 128, 256]
  8k1k TEP: [4, 8, 16]        8k1k DEP: [32, 64, 128]

Also extends the DEP high-conc tail by one point each side
(1k1k 128 -> 256, 8k1k 64 -> 128).

The TEP recipe carries no DP flags (data-parallel-size is 1, so there
is no data-parallel-hybrid-lb or DP RPC port to set) and lowers
max-num-seqs from 512 to 64; max-num-batched-tokens stays at 512. The
smaller max-num-seqs keeps cudagraph capture from reserving memory for
batch shapes never reached at conc<=32.

Keeps the existing DSV4 startup workarounds (VLLM_USE_NCCL_SYMM_MEM,
gpu-memory-utilization=0.85, no all2all-backend override, etc).
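
A toy model of the low-conc intuition (illustrative only — it counts
busy attention GPUs and ignores MoE all2all, KV transfer, and batching
effects entirely):

    def busy_attn_gpus(conc: int, dp: int, tp: int) -> int:
        # Each in-flight request runs on exactly one DP replica of tp
        # GPUs; replicas without a request in hand sit idle.
        return min(conc, dp) * tp

    for conc in (4, 16, 64):
        print(conc,
              busy_attn_gpus(conc, dp=16, tp=1),   # DEP
              busy_attn_gpus(conc, dp=1, tp=16))   # TEP

At conc=4, DEP keeps 4 of 16 GPUs on attention while TEP applies all
16 to every request — the latency argument above, modulo cross-node TP
overhead.
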
Doubles the matrix from 2 to 4 entries (validated via MultiNodeMatrixEntry). Also adds `du -sh "$MODEL_PATH"` in the dynamo-vllm branch of launch_h100-dgxc-slurm.sh so model size shows in CI output — useful for catching partial downloads or wrong revisions before the 8-min weight-load step. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 46 ++++++- .../disagg-h100-fp8-1p1d-tep16-tep16.yaml | 129 ++++++++++++++++++ .../disagg-h100-fp8-1p1d-tep16-tep16.yaml | 111 +++++++++++++++ runners/launch_h100-dgxc-slurm.sh | 3 + 4 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1a5b1de92..f737a6946 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2462,11 +2462,37 @@ dsv4-fp8-h100-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true + # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode + # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more + # workers). They differ in how each worker's 16 GPUs are organised: + # - TEP (tp:16,ep:16,dp-attn:false): one replica spread across all 16 + # GPUs, experts sharded. Better latency at low conc — every request + # gets the full compute. Cross-node TP is expensive per token but + # wins when batch is small. + # - DEP (tp:1,ep:16,dp-attn:true): 16 independent attention paths, + # experts sharded. Higher batching capacity, better at high conc. + # Pattern follows dsr1-fp8-h100-dynamo-sglang's TEP/DEP split. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 64, 128] + # TEP — low conc (latency-bound) + - conc-list: [4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DEP — high conc (throughput-bound) + - conc-list: [64, 128, 256] prefill: num-worker: 1 tp: 1 @@ -2483,7 +2509,23 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 64] + # TEP — low conc (latency-bound) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DEP — high conc (throughput-bound) + - conc-list: [32, 64, 128] prefill: num-worker: 1 tp: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..a06bf738b --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,129 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) +# +# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same 2P+2D footprint +# (4 nodes total) but each worker uses tensor parallel across all 16 GPUs +# instead of data parallel. +# +# - DEP variant (high conc): tp=1, dp=16, ep=16. 16 independent attention +# paths across the 16 GPUs, sharded experts. Maximises batching capacity. +# - TEP variant (this file, low conc): tp=16, dp=1, ep=16. One replica +# spread across all 16 GPUs, with experts also sharded across the same +# ranks. All 16 GPUs cooperate on a single forward pass — gives lower +# latency at small batch sizes since each request gets all the compute. +# +# Cross-node TP=16 routes attention all-reduce + MoE all2all across the IB +# fabric. That's expensive per-token but only meaningful at high +# concurrency; at conc 4-32 the latency win from more compute-per-request +# beats the IB overhead. +# +# Same DSV4 startup workarounds as the DEP recipe (alec-flowers fork +# pinning ai-dynamo to 6a159fe, NCCL_SYMM_MEM, no all2all-backend, drop +# API-server flags, gpu-memory-utilization 0.85 for indexer headroom). 
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + # TEP serves smaller batches; cap max-num-seqs so cudagraph capture + # doesn't waste memory on shapes we won't hit at conc <= 32. + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..9e82c02f3 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,111 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) — 8k/1k +# +# See 1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml for the full DEP-vs-TEP +# rationale and the DSV4 startup workarounds. This file just changes the +# benchmark block to ISL=8192 and trims the conc list. 
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + data-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" + use_chat_template: false diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 3a279b6d3..26eabad89 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -47,6 +47,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" exit 1 fi + # Verify the weights are staged and log their size (catches partial + # downloads / wrong revisions before we burn 8 min on weight load). + du -sh "$MODEL_PATH" 2>/dev/null || echo "WARNING: could not stat $MODEL_PATH" else echo "Unsupported framework: $FRAMEWORK. 
Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 From 1bdeb9ef09b5b466c4ac601b210ba12b89fca7e1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:35:26 -0700 Subject: [PATCH 10/13] dsv4 h100 recipes: replace broken TEP with low-conc DEP variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweep 24921015519 surfaced that cross-node TP=16 doesn't work with the Dynamo+vLLM stack: pydantic_core._pydantic_core.ValidationError: 1 validation error for ParallelConfig Value error, World size (16) is larger than the number of available GPUs (1) in this node. If this is intentional and you are using: - ray, set '--distributed-executor-backend ray'. - multiprocessing, set '--nnodes' appropriately. Dynamo spawns one vLLM process per GPU; each process only sees its single local GPU and vLLM rejects world_size=16. Working around this would need --distributed-executor-backend=ray which Dynamo doesn't coordinate. None of the working DSV4 vLLM recipes (kimi GB200, DSR1 H100, PR 1129 GB200) use cross-node TP either — the execution model assumes one process per GPU. So drop TEP entirely; instead deliver two DEP recipes per ISL/OSL that differ in batch tuning: DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, enforce-eager=true on decode (no cudagraph). Smaller cudagraph capture footprint, faster warmup, no decode kernel-launch optimization (irrelevant at conc<=32 where network round-trips dominate per-token latency). DEP (high conc, existing): max-num-seqs=512, max-num-batched-tokens =512, decode cudagraph enabled. Higher batching throughput at conc>=64. Conc splits unchanged from previous attempt: 1k1k eager [4,8,16,32] 1k1k dep [64,128,256] 8k1k eager [4,8,16] 8k1k dep [32,64,128] Same 4 matrix entries, all with the same tp=1/dp=16/ep=16/dp-attn=true metadata; differentiation is via the CONFIG_FILE pointer in additional-settings (mirrors how the trtllm dsr1-h100 recipes encode multiple variants of the same topology). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 51 ++++++++-------- ...sagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} | 58 ++++++++++--------- ...sagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} | 26 +++++---- 3 files changed, 72 insertions(+), 63 deletions(-) rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/{disagg-h100-fp8-1p1d-tep16-tep16.yaml => disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} (62%) rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/{disagg-h100-fp8-1p1d-tep16-tep16.yaml => disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml} (82%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f737a6946..c04c75d79 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2464,34 +2464,37 @@ dsv4-fp8-h100-dynamo-vllm: disagg: true # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more - # workers). They differ in how each worker's 16 GPUs are organised: - # - TEP (tp:16,ep:16,dp-attn:false): one replica spread across all 16 - # GPUs, experts sharded. Better latency at low conc — every request - # gets the full compute. Cross-node TP is expensive per token but - # wins when batch is small. - # - DEP (tp:1,ep:16,dp-attn:true): 16 independent attention paths, - # experts sharded. Higher batching capacity, better at high conc. 
- # Pattern follows dsr1-fp8-h100-dynamo-sglang's TEP/DEP split. + # workers). Cross-node TP=16 (TEP) was attempted but the Dynamo+vLLM + # stack only supports one process per GPU, and vLLM's argparse rejects + # world_size=16 with only 1 local GPU. Both recipes therefore share the + # DEP topology (tp=1, dp=16, ep=16, dp-attn=true) and differ in batch + # tuning: + # - DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, + # enforce-eager=true on decode (no cudagraph). Smaller memory and + # faster warmup; trades decode kernel-launch overhead that doesn't + # matter at conc<=32. + # - DEP (high conc): max-num-seqs=512, max-num-batched-tokens=512, + # decode cudagraph enabled. Higher batch throughput. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # TEP — low conc (latency-bound) + # DEP-eager — low conc (smaller batch, no decode cudagraph) - conc-list: [4, 8, 16, 32] prefill: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false + dp-attn: true additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false - # DEP — high conc (throughput-bound) + dp-attn: true + # DEP — high conc (throughput-bound, decode cudagraph) - conc-list: [64, 128, 256] prefill: num-worker: 1 @@ -2509,22 +2512,22 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # TEP — low conc (latency-bound) + # DEP-eager — low conc (smaller batch, no decode cudagraph) - conc-list: [4, 8, 16] prefill: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false + dp-attn: true additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" decode: num-worker: 1 - tp: 16 + tp: 1 ep: 16 - dp-attn: false - # DEP — high conc (throughput-bound) + dp-attn: true + # DEP — high conc (throughput-bound, decode cudagraph) - conc-list: [32, 64, 128] prefill: num-worker: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml similarity index 62% rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml index a06bf738b..e7256c46a 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml @@ -1,26 +1,29 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) # -# Sister recipe to 
disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same 2P+2D footprint -# (4 nodes total) but each worker uses tensor parallel across all 16 GPUs -# instead of data parallel. +# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same topology +# (tp=1, dp=16, ep=16, dp-attn=true) and same 4-node footprint, but +# tuned for low concurrency (4-32): # -# - DEP variant (high conc): tp=1, dp=16, ep=16. 16 independent attention -# paths across the 16 GPUs, sharded experts. Maximises batching capacity. -# - TEP variant (this file, low conc): tp=16, dp=1, ep=16. One replica -# spread across all 16 GPUs, with experts also sharded across the same -# ranks. All 16 GPUs cooperate on a single forward pass — gives lower -# latency at small batch sizes since each request gets all the compute. +# - max-num-seqs: 64 (vs 512). Engine never queues more than ~conc +# sequences at conc<=32, so the larger budget just wastes cudagraph +# capture memory. +# - max-num-batched-tokens: 256 (vs 512). Smaller prefill chunks = +# lower TTFT for sparse traffic. Throughput penalty is irrelevant +# at low conc. +# - enforce-eager: true on decode (vs cudagraph). Cudagraph capture +# reserves ~1-2 GiB per worker and adds ~30s warmup. At low conc +# the per-token kernel-launch overhead is dominated by network +# round-trips anyway, so eager mode is a fine tradeoff. # -# Cross-node TP=16 routes attention all-reduce + MoE all2all across the IB -# fabric. That's expensive per-token but only meaningful at high -# concurrency; at conc 4-32 the latency win from more compute-per-request -# beats the IB overhead. +# Originally targeted as a TEP variant (tp=16) but Dynamo's vLLM workers +# spawn one process per GPU and vLLM rejects world_size=16 with only 1 +# local GPU per process. Cross-node TP would need +# --distributed-executor-backend=ray which Dynamo doesn't coordinate. +# So we keep DEP topology and differentiate by batch tuning instead. # -# Same DSV4 startup workarounds as the DEP recipe (alec-flowers fork -# pinning ai-dynamo to 6a159fe, NCCL_SYMM_MEM, no all2all-backend, drop -# API-server flags, gpu-memory-utilization 0.85 for indexer headroom). +# All other DSV4 startup workarounds match the high-conc DEP recipe. -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager" model: path: "dsv4-fp8" @@ -68,6 +71,7 @@ backend: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -86,16 +90,15 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 - # TEP serves smaller batches; cap max-num-seqs so cudagraph capture - # doesn't waste memory on shapes we won't hit at conc <= 32. 
max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -108,17 +111,18 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml similarity index 82% rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml index 9e82c02f3..e0b2af853 100644 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml @@ -1,10 +1,9 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, TEP variant) — 8k/1k +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) — 8k/1k # -# See 1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml for the full DEP-vs-TEP -# rationale and the DSV4 startup workarounds. This file just changes the -# benchmark block to ISL=8192 and trims the conc list. +# See 1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml for full rationale. +# Same low-conc-tuned DEP variant; only the benchmark block differs. 
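+# (Here: isl 8192, osl 1024, concurrencies "4x8x16" vs "4x8x16x32" in the
+# 1k1k eager recipe; the name gains an -8k1k suffix.)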
-name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager-8k1k" model: path: "dsv4-fp8" @@ -52,6 +51,7 @@ backend: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -70,14 +70,15 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -90,17 +91,18 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 16 + tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 64 - max-num-batched-tokens: 512 + max-num-batched-tokens: 256 gpu-memory-utilization: 0.85 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" From 17dcc847a5da533fce8027fe4e6a15a992138d15 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:38:15 -0700 Subject: [PATCH 11/13] dsv4 h100: revert to single high-conc DEP config (working from run 24914869373) The eager low-conc DEP variant added in 1bdeb9ef was untested, and the TEP variant before that didn't work at all on Dynamo+vLLM. Drop both and revert to the single-DEP search-space form that successfully served gsm8k eval-only in run 24914869373: 1k1k DEP: conc [4, 8, 16, 32, 64, 128] 8k1k DEP: conc [4, 8, 16, 32, 64] Each entry uses tp=1, dp=16, ep=16, dp-attn=true (1P+1D filling the 4-node h100-multinode pool). max-num-seqs=512, decode cudagraph on, gpu-memory-utilization=0.85. Removes: - benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 53 +------ ...isagg-h100-fp8-1p1d-dep16-dep16-eager.yaml | 133 ------------------ ...isagg-h100-fp8-1p1d-dep16-dep16-eager.yaml | 113 --------------- 3 files changed, 6 insertions(+), 293 deletions(-) delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c04c75d79..f3a0158d4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2462,40 +2462,15 @@ dsv4-fp8-h100-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - # Two recipes per ISL/OSL, both 1P+1D over the full 4-node h100-multinode - # pool (DSV4-Pro weights need >=2 nodes per side, so we cannot fit more - # workers). 
Cross-node TP=16 (TEP) was attempted but the Dynamo+vLLM - # stack only supports one process per GPU, and vLLM's argparse rejects - # world_size=16 with only 1 local GPU. Both recipes therefore share the - # DEP topology (tp=1, dp=16, ep=16, dp-attn=true) and differ in batch - # tuning: - # - DEP-eager (low conc): max-num-seqs=64, max-num-batched-tokens=256, - # enforce-eager=true on decode (no cudagraph). Smaller memory and - # faster warmup; trades decode kernel-launch overhead that doesn't - # matter at conc<=32. - # - DEP (high conc): max-num-seqs=512, max-num-batched-tokens=512, - # decode cudagraph enabled. Higher batch throughput. + # 1P+1D DEP over the full 4-node h100-multinode pool (DSV4-Pro weights + # need >=2 nodes per side, so we cannot fit more workers). High-conc + # only — this is the configuration that successfully served eval-only + # gsm8k end-to-end in run 24914869373. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # DEP-eager — low conc (smaller batch, no decode cudagraph) - - conc-list: [4, 8, 16, 32] - prefill: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - # DEP — high conc (throughput-bound, decode cudagraph) - - conc-list: [64, 128, 256] + - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 1 @@ -2512,23 +2487,7 @@ dsv4-fp8-h100-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # DEP-eager — low conc (smaller batch, no decode cudagraph) - - conc-list: [4, 8, 16] - prefill: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - # DEP — high conc (throughput-bound, decode cudagraph) - - conc-list: [32, 64, 128] + - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 1 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml deleted file mode 100644 index e7256c46a..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) -# -# Sister recipe to disagg-h100-fp8-1p1d-dep16-dep16.yaml. Same topology -# (tp=1, dp=16, ep=16, dp-attn=true) and same 4-node footprint, but -# tuned for low concurrency (4-32): -# -# - max-num-seqs: 64 (vs 512). Engine never queues more than ~conc -# sequences at conc<=32, so the larger budget just wastes cudagraph -# capture memory. -# - max-num-batched-tokens: 256 (vs 512). Smaller prefill chunks = -# lower TTFT for sparse traffic. Throughput penalty is irrelevant -# at low conc. -# - enforce-eager: true on decode (vs cudagraph). Cudagraph capture -# reserves ~1-2 GiB per worker and adds ~30s warmup. At low conc -# the per-token kernel-launch overhead is dominated by network -# round-trips anyway, so eager mode is a fine tradeoff. 
-# -# Originally targeted as a TEP variant (tp=16) but Dynamo's vLLM workers -# spawn one process per GPU and vLLM rejects world_size=16 with only 1 -# local GPU per process. Cross-node TP would need -# --distributed-executor-backend=ray which Dynamo doesn't coordinate. -# So we keep DEP topology and differentiate by batch tuning instead. -# -# All other DSV4 startup workarounds match the high-conc DEP recipe. - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml deleted file mode 100644 index e0b2af853..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# DeepSeek-V4-Pro 
Disaggregated with vLLM (H100 1P+1D / 32-GPU, low-conc DEP) — 8k/1k -# -# See 1k1k/disagg-h100-fp8-1p1d-dep16-dep16-eager.yaml for full rationale. -# Same low-conc-tuned DEP variant; only the benchmark block differs. - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-eager-8k1k" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 64 - max-num-batched-tokens: 256 - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - req_rate: "inf" - use_chat_template: false From 3cfdb7b468ee7b5b868060a0e200b0fa83032aae Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:23:14 -0700 Subject: [PATCH 12/13] Bump dsv4 H100 health_check timeout to 4h, slurm time_limit to 8h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 24922713022 hit the default 1800s orchestrator deadline on all three matrix jobs (1k1k bench, 8k1k bench, 8k1k eval). Concurrent multinode matrix jobs starve the same Lustre OSTs — first shard load took 423s, shard 8/64 was reached at 16 min, projected total weight load ~107 min. 
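Back-of-envelope on the deadlines (illustrative sketch, not code in the
repo; simple linear extrapolation from the shard timings above):

    # hypothetical check of the new knobs against the observed load rate
    default_deadline = 180 * 10            # srt-slurm default: 1800 s
    bumped_deadline = 1440 * 10            # this change: 14400 s = 4 h
    est_weight_load = (16 * 60 / 8) * 64   # 2 min/shard at shard 8/64 -> ~128 min
    assert default_deadline < est_weight_load < bumped_deadline

(The run's own projection was ~107 min; either estimate lands far past the
1800 s default and comfortably inside 4 h.)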
Match the GB200 dsv4 recipes which already added these blocks for the
same reason.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 17 +++++++++++++++++
 .../8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml | 10 ++++++++++
 2 files changed, 27 insertions(+)

diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 90f5938a7..5a5164072 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -32,6 +32,23 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Bumped from the srt-slurm 6h default. Three multinode matrix jobs (1k1k
+# bench, 8k1k bench, 8k1k eval) launch concurrently from the run-sweep
+# workflow and all read the same /mnt/nfs/lustre/models/dsv4-fp8 — combined
+# with cudagraph capture this can exceed the default wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24922713022
+# saw shard 1/64 take 423s and shard 8/64 reach only 16 min into the load —
+# the 32 H100 workers per job × 3 concurrent matrix jobs starve the same
+# Lustre OSTs. Default 1800s deadline fired before any job became healthy.
+# Match the GB200 dsv4 recipes; over-long deadline just idles, doesn't burn
+# compute.
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
 resources:
   gpu_type: "h100"
   gpus_per_node: 8
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
index 5cb4ff084..ef9a8224e 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
@@ -23,6 +23,16 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# See 1k1k recipe for slurm.time_limit and health_check rationale — three
+# concurrent matrix jobs starve the same Lustre OSTs and the default 1800s
+# orchestrator deadline fires before any job becomes healthy.
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
 resources:
   gpu_type: "h100"
   gpus_per_node: 8

From f798361a93a6b4e46260c4ffd693de58206b2ca5 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 23:34:39 -0700
Subject: [PATCH 13/13] Switch dsv4 H100 disagg from DP=16 to cross-node TP=16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DSV4-Pro per-rank weights are 74.99 GiB at DP=16/EP=16 — H100 80GB leaves
only ~4 GiB headroom and sparse_attn_indexer's profile_run
torch.empty(512 MiB) OOMs (run 24923521075). Cross-node TP=16 shards the
model 16-way across 2 nodes (~50 GiB per rank).

srt-slurm's vllm.py:386-388 emits --headless on the secondary node when
data-parallel-size is absent and the worker spans nodes; Dynamo's
run_dynamo_headless calls vLLM's run_headless which uses MultiprocExecutor
+ torch.distributed (no Ray) to form the cross-node PG. NCCL TP all-reduce
flows over IB on every layer — slower per-token than intra-node NVLink,
but the only way to fit DSV4-Pro at 80 GB.
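Illustrative sketch of the per-node argv split this produces (hypothetical
helper, not srt-slurm's actual vllm.py code; flag names as cited above):

    # leader (node_rank=0) runs full Dynamo + vLLM; secondary gets --headless
    def node_argv(node_rank: int, master_addr: str) -> list[str]:
        argv = [
            "--tensor-parallel-size", "16",
            "--nnodes", "2",
            "--node-rank", str(node_rank),
            "--master-addr", master_addr,
        ]
        if node_rank > 0:
            argv.append("--headless")  # joins the leader's PG, no frontend
        return argv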
Other changes: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce
allocator fragmentation; gpu-memory-utilization back to 0.95 (matches
H200); enforce-eager on decode for the first attempt (cross-node
cudagraphs are fragile).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/nvidia-master.yaml            |  37 ++--
 .../disagg-h100-fp8-1p1d-dep16-dep16.yaml     | 171 ----------------
 .../disagg-h100-fp8-1p1d-tep16-tep16.yaml     | 183 ++++++++++++++++++
 ... => disagg-h100-fp8-1p1d-tep16-tep16.yaml} |  47 ++---
 4 files changed, 215 insertions(+), 223 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
 rename benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/{disagg-h100-fp8-1p1d-dep16-dep16.yaml => disagg-h100-fp8-1p1d-tep16-tep16.yaml} (57%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0c8457169..1182a0070 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2450,8 +2450,10 @@ dsv4-fp8-h200-vllm:
 
 # DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm.
 # 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool).
-# Minimum viable disagg shape: DSV4-Pro FP8 weights (~862 GB) don't fit on one
-# H100 node (8x80GB=640GB), so each side must own >=2 nodes. Recipes bundled
+# Cross-node TP=16 over IB: DSV4-Pro per-rank weight footprint at DP=16/EP=16
+# is 74.99 GiB on H100 80GB (run 24923521075 OOM'd in sparse_attn_indexer
+# profile_run with only ~4 GiB headroom). TP=16 shards the model 16-way
+# across the 2 nodes, dropping per-rank weights to ~50 GiB. Recipe bundled
 # locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed.
 dsv4-fp8-h100-dynamo-vllm:
   image: vllm/vllm-openai:deepseekv4-cu129
@@ -2462,10 +2464,9 @@ dsv4-fp8-h100-dynamo-vllm:
   framework: dynamo-vllm
   multinode: true
   disagg: true
-  # 1P+1D DEP over the full 4-node h100-multinode pool (DSV4-Pro weights
-  # need >=2 nodes per side, so we cannot fit more workers). High-conc
-  # only — this is the configuration that successfully served eval-only
-  # gsm8k end-to-end in run 24914869373.
+  # 1P+1D TEP=16 across the full 4-node h100-multinode pool.
+  # Each prefill/decode worker spans 2 nodes via Dynamo's --headless
+  # secondary-node mode + vLLM's MultiprocExecutor + torch.distributed PG.
seq-len-configs: - isl: 1024 osl: 1024 @@ -2473,34 +2474,34 @@ dsv4-fp8-h100-dynamo-vllm: - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false - isl: 8192 osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false additional-settings: - # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml" + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 - dp-attn: true + dp-attn: false # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml deleted file mode 100644 index 5a5164072..000000000 --- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml +++ /dev/null @@ -1,171 +0,0 @@ -# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) -# -# Forked from NVIDIA/srt-slurm recipes/vllm/deepseek-r1/disagg-h100-16gpu.yaml. -# Engine flags updated to match the single-node H200 DSV4 recipe (deepseek_v4 -# tokenizer/parsers, FP8 KV cache, block_size=256, prefix caching disabled, -# compilation mode 0). Kept from DSR1: NixlConnector P<->D KV transfer, -# VLLM_MOE_DP_CHUNK_SIZE=192 for H100 80GB (vs H200 141GB default of 384), -# deepep all2all backends, VLLM_USE_DEEP_GEMM. -# -# max-model-len is 16384, not H200's 800000 — KV for 800k context does not -# fit across two 80GB decode nodes. -# -# DP+EP configuration: -# - Each GPU runs its own vLLM process (tensor-parallel-size: 1) -# - 1 prefill endpoint x 16 GPUs (2 nodes, DP16) -> 16 prefill processes -# - 1 decode endpoint x 16 GPUs (2 nodes, DP16) -> 16 decode processes -# - Total: 32 GPUs across 4 nodes (fills the h100-multinode pool) - -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16" - -model: - path: "dsv4-fp8" - container: "vllm/vllm-openai:deepseekv4-cu129" - precision: "fp8" - -# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM -# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), -# which extends the dynamo config to accept `hash` as well as `version`. -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -# Bumped from the srt-slurm 6h default. 
Three multinode matrix jobs (1k1k -# bench, 8k1k bench, 8k1k eval) launch concurrently from the run-sweep -# workflow and all read the same /mnt/nfs/lustre/models/dsv4-fp8 — combined -# with cudagraph capture this can exceed the default wall clock. -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24922713022 -# saw shard 1/64 take 423s and shard 8/64 reach only 16 min into the load — -# the 32 H100 workers per job × 3 concurrent matrix jobs starve the same -# Lustre OSTs. Default 1800s deadline fired before any job became healthy. -# Match the GB200 dsv4 recipes; over-long deadline just idles, doesn't burn -# compute. -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "h100" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 16 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: nixl - - # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL - # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM - # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp - # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is - # the companion flag. Matches PR 1129 GB200 (where deepep's NVSHMEM has - # the same issue). - prefill_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - decode_environment: - VLLM_USE_DEEP_GEMM: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" - VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_SERVER_DEV_MODE: "1" - TILELANG_CLEANUP_TEMP_FILES: "1" - NVIDIA_GDRCOPY: "enabled" - GLOO_SOCKET_IFNAME: "eth0" - PYTHONUNBUFFERED: "1" - - vllm_config: - # Tokenizer mode, tool-call parser, reasoning parser, and - # enable-auto-tool-choice are OpenAI API-server flags; Dynamo is the - # frontend in this disagg setup and handles tool/reasoning parsing - # itself. The vLLM workers are engine-only processes and their argparse - # rejects those flags (matches kimi-k2.5 recipe which omits them too). - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - kv-cache-dtype: "fp8" - block-size: 256 - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 512 - max-num-batched-tokens: 512 - # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer - # transients on top of vLLM's reserved KV/weights/activations. The H200 - # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB - # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and - # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB. 
-      gpu-memory-utilization: 0.85
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      kv-cache-dtype: "fp8"
-      block-size: 256
-      no-enable-prefix-caching: true
-      no-enable-flashinfer-autotune: true
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 16384
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      # H100 80GB needs ~6 GiB headroom for DSV4's sparse attention indexer
-      # transients on top of vLLM's reserved KV/weights/activations. The H200
-      # single-node recipe uses 0.95, but H200 has 141 GiB/GPU so 4 GiB
-      # headroom is enough there. On H100, 0.95 leaves only ~4 GiB free and
-      # the indexer OOMs at warm-up. 0.85 reserves ~12 GiB.
-      gpu-memory-utilization: 0.85
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}'
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64x128"
-  req_rate: "inf"
-  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
-  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
-  # but sa-bench's local apply_chat_template path raises ValueError.
-  # Send raw prompts; the server handles formatting.
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
new file mode 100644
index 000000000..049fa8218
--- /dev/null
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -0,0 +1,183 @@
+# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16)
+#
+# DSV4-Pro FP8 weights are 74.99 GiB per DP rank — DP=16/EP=16 (one process
+# per GPU) leaves only ~4 GiB headroom on H100 80GB and sparse_attn_indexer
+# OOMs during profile_run (run 24923521075 hit this on every prefill and
+# decode worker simultaneously). Switching to cross-node TP=16 shards
+# dense+expert weights 16-way → ~50 GiB/GPU, plenty of headroom.
+#
+# Layout:
+# - 1 prefill endpoint × TP=16 across 2 nodes (16 GPUs)
+# - 1 decode endpoint × TP=16 across 2 nodes (16 GPUs)
+# - Total: 32 GPUs across 4 nodes (fills h100-multinode pool)
+#
+# How the cross-node launch works:
+# - srt-slurm's standard TP mode (no data-parallel-size) starts one srun
+#   per node, each process gets all 8 local GPUs.
+# - Leader (node_rank=0): full Dynamo + vLLM with `--master-addr
+#   --nnodes 2 --node-rank 0 --tensor-parallel-size 16`. MultiprocExecutor
+#   spawns 8 local workers, then waits for the 8 remote workers.
+# - Secondary (node_rank=1): adds `--headless` (srt-slurm vllm.py:386-388),
+#   which routes through dynamo's run_dynamo_headless → vLLM's run_headless →
+#   MultiprocExecutor(monitor_workers=False) joining the leader's PG over
+#   torch.distributed (master_addr / master_port). NCCL backs the TP
+#   all-reduce; on h100-multinode that flows over IB (no NVLink between
+#   nodes), so per-layer TP comms is the dominant latency cost — accepted
+#   as the only way to fit DSV4-Pro on 80 GB H100.
+ +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel AND that exposes --headless via run_dynamo_headless (see +# components/src/dynamo/vllm/main.py:80). Requires the alec-flowers/srt-slurm +# fork (NVIDIA/srt-slurm#71), which extends the dynamo config to accept +# `hash` as well as `version`. +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +# Bumped from the srt-slurm 6h default. Cross-node TP=16 over IB plus +# cudagraph capture extends post-load init noticeably; cold-cache Lustre +# weight load alone took 24 min in run 24923521075. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24923521075 +# observed full weight load taking ~1450s with three concurrent matrix +# jobs starving the same Lustre OSTs. Cross-node TP setup + indexer +# warm-up adds more on top. Match the GB200 dsv4 recipes; the cost of an +# over-long deadline is sitting idle, not wasted compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL + # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM + # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp + # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is + # the companion flag. + # + # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True reduces fragmentation + # in the tight headroom — even with TP=16 sharding the model 16-way, a + # contiguous 512 MiB indexer scratch can still fail under fragmented + # 80 GB caches. The OOM error message itself recommends this exact flag. + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + # tensor-parallel-size: 16 across 2 nodes (each srt-slurm srun process + # gets 8 local GPUs; vLLM forms a 16-rank PG via the master_addr + # handshake). data-parallel-size is intentionally absent — its presence + # would push srt-slurm into the per-GPU process layout, which is what + # broke the previous TEP=16 attempt with "World size (16) > available + # GPUs (1) in this node". + # + # enable-expert-parallel keeps experts sharded EP=16 along the same + # 16 ranks (each rank holds 1/16 of the routed experts). Communication + # is dominated by per-layer TP all-reduce + EP all-to-all, both over + # IB. 
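+    #
+    # Illustrative contrast of the two process layouts (a sketch, not the
+    # exact argv srt-slurm builds):
+    #   per-GPU DP (previous attempt, OOM'd): 16 processes x 1 GPU,
+    #     each with data-parallel-size 16; dense weights replicated per rank
+    #   cross-node TP (this file): 2 processes x 8 GPUs,
+    #     tensor-parallel-size 16, --nnodes 2, --node-rank {0,1},
+    #     --headless on rank 1; dense+expert weights sharded 16-way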
+    #
+    # enforce-eager on both sides for the first cross-node attempt: cudagraph
+    # capture across nodes is fragile (FULL_DECODE_ONLY graphs include the
+    # cross-node TP all-reduce) and the previous run's 1.48 GiB private-pool
+    # accumulation already burned the headroom. Drop graphs to ship; revisit
+    # for decode performance once the server is observed healthy.
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8"
+      block-size: 256
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 16
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      # With TP=16 the per-rank model footprint drops from ~75 GiB to
+      # ~50 GiB, so we can match the H200 single-node 0.95 utilization.
+      gpu-memory-utilization: 0.95
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8"
+      block-size: 256
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 16
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      gpu-memory-utilization: 0.95
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64x128"
+  req_rate: "inf"
+  # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
+  # server uses --tokenizer-mode deepseek_v4 to handle templating itself,
+  # but sa-bench's local apply_chat_template path raises ValueError.
+  # Send raw prompts; the server handles formatting.
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
similarity index 57%
rename from benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
rename to benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
index ef9a8224e..e31e8d531 100644
--- a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
+++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -1,31 +1,26 @@
-# DeepSeek-V4-Pro Disaggregated with vLLM (H100 2P+2D / 32-GPU) — 8k/1k
+# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) — 8k/1k
 #
 # Same engine flags as the 1k1k variant. Only the benchmark block differs
 # (ISL=8192, tighter concurrency sweep due to larger prefill work).
 #
-# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-dep16-dep16.yaml
-# for the full rationale (FP8 KV cache, block_size=256, deepseek_v4 parsers,
-# NixlConnector, H100-tuned VLLM_MOE_DP_CHUNK_SIZE=192).
+# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +# for the full rationale (cross-node TP=16 layout, MultiprocExecutor + +# torch.distributed PG handshake, --headless secondary node, IB-backed +# NCCL TP all-reduce, deepseek_v4 parsers, NixlConnector). -name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-dep16-dep16-8k1k" +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" model: path: "dsv4-fp8" container: "vllm/vllm-openai:deepseekv4-cu129" precision: "fp8" -# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM -# wheel. Requires the alec-flowers/srt-slurm fork (NVIDIA/srt-slurm#71), -# which extends the dynamo config to accept `hash` as well as `version`. dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh -# See 1k1k recipe for slurm.time_limit and health_check rationale — three -# concurrent matrix jobs starve the same Lustre OSTs and the default 1800s -# orchestrator deadline fires before any job becomes healthy. slurm: time_limit: "8:00:00" @@ -51,39 +46,33 @@ backend: type: vllm connector: nixl - # See 1k1k recipe for the rationale behind VLLM_USE_NCCL_SYMM_MEM / - # NCCL_CUMEM_ENABLE — the DSV4 wheel's NVSHMEM fails IPC bootstrap. prefill_environment: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_SERVER_DEV_MODE: "1" TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: VLLM_USE_DEEP_GEMM: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - VLLM_MOE_DP_CHUNK_SIZE: "192" VLLM_SKIP_P2P_CHECK: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_SERVER_DEV_MODE: "1" TILELANG_CLEANUP_TEMP_FILES: "1" NVIDIA_GDRCOPY: "enabled" GLOO_SOCKET_IFNAME: "eth0" PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" vllm_config: - # API-server flags (tokenizer-mode, tool-call-parser, reasoning-parser, - # enable-auto-tool-choice) are handled by Dynamo frontend, not the - # vLLM engine workers. See 1k1k recipe for rationale. prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -92,18 +81,14 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 + tensor-parallel-size: 16 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom - # for DSV4's sparse attention indexer. - gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.95 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true @@ -115,20 +100,16 @@ backend: block-size: 256 no-enable-prefix-caching: true no-enable-flashinfer-autotune: true - tensor-parallel-size: 1 + tensor-parallel-size: 16 pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 enable-expert-parallel: true + enforce-eager: true max-model-len: 16384 max-num-seqs: 512 max-num-batched-tokens: 512 - # See 1k1k recipe for the rationale; H100 80GB needs ~6 GiB headroom - # for DSV4's sparse attention indexer. 
- gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.95 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' benchmark: type: "sa-bench" @@ -136,6 +117,4 @@ benchmark: osl: 1024 concurrencies: "4x8x16x32x64" req_rate: "inf" - # See 1k1k recipe rationale — DSV4-Pro tokenizer has no chat_template; - # the server handles formatting via --tokenizer-mode deepseek_v4. use_chat_template: false
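A minimal repro of the chat-template pitfall those benchmark blocks guard
against with use_chat_template: false (a sketch; model id as served above,
and it assumes a transformers version that raises rather than warns when
tokenizer.chat_template is unset):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "deepseek-ai/DeepSeek-V4-Pro", trust_remote_code=True)
    try:
        tok.apply_chat_template(
            [{"role": "user", "content": "hi"}], tokenize=False)
    except ValueError:
        pass  # no chat_template on the tokenizer -> send raw prompts instead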