17 commits
7dae672
Add H100 config: dsv4-fp8-dynamo-vllm (DeepSeek-V4-Pro multinode disagg)
Oseltamivir Apr 24, 2026
0cd54af
Update perf-changelog pr-link to PR 1142
Oseltamivir Apr 24, 2026
88b80c0
launch_h100: pre-create logs dir and tar outputs/ on early failure
Oseltamivir Apr 24, 2026
e0359c6
Fix dsv4 dynamo-vllm: switch to alec-flowers/srt-slurm@PR71 fork
Oseltamivir Apr 24, 2026
b92ef5a
dsv4 h100 recipes: drop API-server-only flags
Oseltamivir Apr 24, 2026
b7336fd
dsv4 h100 recipes: route around NVSHMEM IPC failure
Oseltamivir Apr 24, 2026
71ac58a
dsv4 h100 recipes: lower gpu-memory-utilization 0.95 -> 0.85
Oseltamivir Apr 24, 2026
3495a78
Merge branch 'main' into dsv4-fp8-h100-dynamo-vllm
Oseltamivir Apr 24, 2026
5ce459b
dsv4 h100 recipes: disable sa-bench chat-template path
Oseltamivir Apr 24, 2026
837179d
Merge branch 'main' into dsv4-fp8-h100-dynamo-vllm
Oseltamivir Apr 25, 2026
65d223f
dsv4 h100: add TEP variant + du -sh model size diagnostic
Oseltamivir Apr 25, 2026
1bdeb9e
dsv4 h100 recipes: replace broken TEP with low-conc DEP variant
Oseltamivir Apr 25, 2026
17dcc84
dsv4 h100: revert to single high-conc DEP config (working from run 24…
Oseltamivir Apr 25, 2026
f3693b7
Merge remote-tracking branch 'origin/main' into dsv4-fp8-h100-dynamo-…
Oseltamivir Apr 25, 2026
3cfdb7b
Bump dsv4 H100 health_check timeout to 4h, slurm time_limit to 8h
Oseltamivir Apr 25, 2026
f798361
Switch dsv4 H100 disagg from DP=16 to cross-node TP=16
Oseltamivir Apr 25, 2026
66d0da9
Merge branch 'main' into dsv4-fp8-h100-dynamo-vllm
Oseltamivir Apr 25, 2026
55 changes: 55 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -2461,6 +2461,61 @@ dsv4-fp8-h200-vllm:
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }

# DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm.
# 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool).
# Cross-node TP=16 over IB: DSV4-Pro per-rank weight footprint at DP=16/EP=16
# is 74.99 GiB on H100 80GB (run 24923521075 OOM'd in sparse_attn_indexer
# profile_run with only ~4 GiB headroom). TP=16 shards the model 16-way
# across the 2 nodes, dropping per-rank weights to ~5 GiB. Recipe bundled
# locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed.
dsv4-fp8-h100-dynamo-vllm:
image: vllm/vllm-openai:deepseekv4-cu129
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: h100-multinode
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
# 1P+1D TEP=16 across the full 4-node h100-multinode pool.
# Each prefill/decode worker spans 2 nodes via Dynamo's --headless
# secondary-node mode + vLLM's MultiprocExecutor + torch.distributed PG.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- conc-list: [4, 8, 16, 32, 64, 128]
prefill:
num-worker: 1
tp: 16
ep: 16
dp-attn: false
additional-settings:
# benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
- "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: false
- isl: 8192
osl: 1024
search-space:
- conc-list: [4, 8, 16, 32, 64]
prefill:
num-worker: 1
tp: 16
ep: 16
dp-attn: false
additional-settings:
# benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
- "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: false

# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
# pareto sweep. The single-node schema has no explicit data-parallel-size
# field, so dp-attn=true is used as the existing vLLM script switch for DP4
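
A rough sketch of the per-rank memory arithmetic behind the switch from DP=16 to cross-node TP=16 described in the dsv4-fp8-h100-dynamo-vllm entry above (and repeated in the recipe header below). The 74.99 GiB and ~5 GiB figures are quoted from those comments; the ~79.6 GiB usable-HBM value is an assumption about the H100 80GB part, not something stated in the recipe.

# Back-of-envelope for the DP=16 -> TP=16 switch (weights only; activations,
# the sparse_attn_indexer scratch and the KV cache all come on top, which is
# where the ~4 GiB of headroom ran out in run 24923521075).
per_rank_weights_dp16_gib = 74.99   # quoted per-GPU weight footprint at DP=16/EP=16
usable_hbm_gib = 79.6               # assumed usable HBM on an H100 80GB part
tp_degree = 16

dp16_headroom = usable_hbm_gib - per_rank_weights_dp16_gib      # ~4.6 GiB
tp16_weights_per_gpu = per_rank_weights_dp16_gib / tp_degree    # ~4.7 GiB, the "~5 GiB" quoted above

print(f"DP=16 headroom before KV cache: {dp16_headroom:.1f} GiB")
print(f"TP=16 weights per GPU:          {tp16_weights_per_gpu:.1f} GiB")
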
183 changes: 183 additions & 0 deletions benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -0,0 +1,183 @@
# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16)
#
# DSV4-Pro FP8 weights are 74.99 GiB per DP rank — DP=16/EP=16 (one process
# per GPU) leaves only ~4 GiB headroom on H100 80GB and sparse_attn_indexer
# OOMs during profile_run (run 24923521075 hit this on every prefill and
# decode worker simultaneously). Switching to cross-node TP=16 shards
# dense+expert weights 16-way → ~5 GiB/GPU, plenty of headroom.
#
# Layout:
# - 1 prefill endpoint × TP=16 across 2 nodes (16 GPUs)
# - 1 decode endpoint × TP=16 across 2 nodes (16 GPUs)
# - Total: 32 GPUs across 4 nodes (fills h100-multinode pool)
#
# How the cross-node launch works:
# - srt-slurm's standard TP mode (no data-parallel-size) starts one srun
# per node; each process gets all 8 local GPUs.
# - Leader (node_rank=0): full Dynamo + vLLM with `--master-addr <leader_ip>
# --nnodes 2 --node-rank 0 --tensor-parallel-size 16`. MultiprocExecutor
# spawns 8 local workers, then waits for the 8 remote workers.
# - Secondary (node_rank=1): adds `--headless` (srt-slurm vllm.py:386-388),
# which routes through dynamo's run_dynamo_headless → vLLM's run_headless →
# MultiprocExecutor(monitor_workers=False) joining the leader's PG over
# torch.distributed (master_addr / master_port). NCCL backs the TP
# all-reduce; on h100-multinode that flows over IB (no NVLink between
# nodes), so per-layer TP communication is the dominant latency cost,
# accepted as the only way to fit DSV4-Pro on 80 GB H100s.
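#
# Illustrative shape of the two per-node launches (hypothetical command and
# placeholder IP; the real invocations are built by srt-slurm's vllm.py, but
# the flags named above land roughly like this):
#   node 0 (leader):    python -m dynamo.vllm ... --tensor-parallel-size 16 \
#                         --nnodes 2 --node-rank 0 --master-addr <leader_ip>
#   node 1 (secondary): same flags, plus --headless and --node-rank 1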

name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16"

model:
path: "dsv4-fp8"
container: "vllm/vllm-openai:deepseekv4-cu129"
precision: "fp8"

# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM
# wheel AND that exposes --headless via run_dynamo_headless (see
# components/src/dynamo/vllm/main.py:80). Requires the alec-flowers/srt-slurm
# fork (NVIDIA/srt-slurm#71), which extends the dynamo config to accept
# `hash` as well as `version`.
dynamo:
hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
install: true

setup_script: vllm-container-deps.sh

# Bumped from the srt-slurm 6h default. Cross-node TP=16 over IB plus
# cudagraph capture extends post-load init noticeably; cold-cache Lustre
# weight load alone took 24 min in run 24923521075.
slurm:
time_limit: "8:00:00"

# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24923521075
# observed full weight load taking ~1450s with three concurrent matrix
# jobs starving the same Lustre OSTs. Cross-node TP setup + indexer
# warm-up adds more on top. Match the GB200 dsv4 recipes; the cost of an
# over-long deadline is sitting idle, not wasted compute.
health_check:
max_attempts: 1440
interval_seconds: 10
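# 1440 attempts x 10 s interval = 14,400 s = 4 h of readiness polling
# (vs. the 180 x 10 s = 1800 s srt-slurm default).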

resources:
gpu_type: "h100"
gpus_per_node: 8
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 16
gpus_per_decode: 16
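# 2 nodes x 8 GPUs = 16 GPUs per endpoint; 1 prefill + 1 decode endpoint
# = 32 GPUs across 4 nodes (the full h100-multinode pool).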

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: nixl

# VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL
# symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM
# wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp
# "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is
# the companion flag.
#
# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True reduces fragmentation
# in the tight headroom — even with TP=16 sharding the model 16-way, a
# contiguous 512 MiB indexer scratch can still fail under fragmented
# 80 GB caches. The OOM error message itself recommends this exact flag.
prefill_environment:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SKIP_P2P_CHECK: "1"
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_SERVER_DEV_MODE: "1"
TILELANG_CLEANUP_TEMP_FILES: "1"
NVIDIA_GDRCOPY: "enabled"
GLOO_SOCKET_IFNAME: "eth0"
PYTHONUNBUFFERED: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

decode_environment:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SKIP_P2P_CHECK: "1"
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_SERVER_DEV_MODE: "1"
TILELANG_CLEANUP_TEMP_FILES: "1"
NVIDIA_GDRCOPY: "enabled"
GLOO_SOCKET_IFNAME: "eth0"
PYTHONUNBUFFERED: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

vllm_config:
# tensor-parallel-size: 16 across 2 nodes (each srt-slurm srun process
# gets 8 local GPUs; vLLM forms a 16-rank PG via the master_addr
# handshake). data-parallel-size is intentionally absent — its presence
# would push srt-slurm into the per-GPU process layout, which is what
# broke the previous TEP=16 attempt with "World size (16) > available
# GPUs (1) in this node".
#
# enable-expert-parallel keeps experts sharded EP=16 along the same
# 16 ranks (each rank holds 1/16 of the routed experts). Communication
# is dominated by per-layer TP all-reduce + EP all-to-all, both over
# IB.
#
# enforce-eager on both sides for the first cross-node attempt: cudagraph
# capture across nodes is fragile (FULL_DECODE_ONLY graphs include the
# cross-node TP all-reduce) and the previous run's 1.48 GiB private-pool
# accumulation already burned the headroom. Drop graphs to ship; revisit
# for decode performance once the server is observed healthy.
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
kv-cache-dtype: "fp8"
block-size: 256
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
tensor-parallel-size: 16
pipeline-parallel-size: 1
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 512
max-num-batched-tokens: 512
# With TP=16 the per-rank model footprint drops from ~75 GiB to
# ~5 GiB, so we can match the H200 single-node 0.95 utilization.
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true

decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
kv-cache-dtype: "fp8"
block-size: 256
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
tensor-parallel-size: 16
pipeline-parallel-size: 1
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 512
max-num-batched-tokens: 512
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "4x8x16x32x64x128"
req_rate: "inf"
# DSV4-Pro's HF tokenizer ships with no chat_template attribute. The
# server uses --tokenizer-mode deepseek_v4 to handle templating itself,
# but sa-bench's local apply_chat_template path raises ValueError.
# Send raw prompts; the server handles formatting.
use_chat_template: false
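
A minimal sketch of the failure mode that use_chat_template: false works around, assuming a recent transformers release (the exact error text varies by version); the snippet below is hypothetical illustration, not sa-bench code. The raw-prompt branch mirrors what the benchmark does when templating is disabled: send the prompt untemplated and let the server-side deepseek_v4 tokenizer mode do the formatting.

# Why the local apply_chat_template path fails for DSV4-Pro.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V4-Pro", trust_remote_code=True)
messages = [{"role": "user", "content": "ping"}]

if tok.chat_template is None:
    # DSV4-Pro's tokenizer ships without a chat_template, so
    # apply_chat_template would raise ValueError; fall back to the raw prompt.
    prompt = messages[0]["content"]
else:
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
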
120 changes: 120 additions & 0 deletions benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
@@ -0,0 +1,120 @@
# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) — 8k/1k
#
# Same engine flags as the 1k1k variant. Only the benchmark block differs
# (ISL=8192, tighter concurrency sweep due to larger prefill work).
#
# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
# for the full rationale (cross-node TP=16 layout, MultiprocExecutor +
# torch.distributed PG handshake, --headless secondary node, IB-backed
# NCCL TP all-reduce, deepseek_v4 parsers, NixlConnector).

name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k"

model:
path: "dsv4-fp8"
container: "vllm/vllm-openai:deepseekv4-cu129"
precision: "fp8"

dynamo:
hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
install: true

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "h100"
gpus_per_node: 8
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 16
gpus_per_decode: 16

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: nixl

prefill_environment:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SKIP_P2P_CHECK: "1"
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_SERVER_DEV_MODE: "1"
TILELANG_CLEANUP_TEMP_FILES: "1"
NVIDIA_GDRCOPY: "enabled"
GLOO_SOCKET_IFNAME: "eth0"
PYTHONUNBUFFERED: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

decode_environment:
VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SKIP_P2P_CHECK: "1"
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_SERVER_DEV_MODE: "1"
TILELANG_CLEANUP_TEMP_FILES: "1"
NVIDIA_GDRCOPY: "enabled"
GLOO_SOCKET_IFNAME: "eth0"
PYTHONUNBUFFERED: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
kv-cache-dtype: "fp8"
block-size: 256
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
tensor-parallel-size: 16
pipeline-parallel-size: 1
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 512
max-num-batched-tokens: 512
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true

decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
kv-cache-dtype: "fp8"
block-size: 256
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
tensor-parallel-size: 16
pipeline-parallel-size: 1
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 512
max-num-batched-tokens: 512
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "4x8x16x32x64"
req_rate: "inf"
use_chat_template: false
10 changes: 10 additions & 0 deletions perf-changelog.yaml
@@ -1819,3 +1819,13 @@
- "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132

- config-keys:
- dsv4-fp8-h100-dynamo-vllm
description:
- "Add DeepSeek-V4-Pro FP8 H100 multinode disagg benchmark via dynamo-vllm"
- "2 prefill nodes + 2 decode nodes (32 H100s total, DP16/EP16 per side)"
- "Image: vllm/vllm-openai:deepseekv4-cu129"
- "Engine flags match H200 single-node recipe (deepseek_v4 tokenizer/parsers, FP8 KV cache, block size 256, prefix caching disabled)"
- "max-model-len 16384 (H100 80GB KV headroom; H200's 800k does not fit across 2 decode nodes)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1142