diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e4177ee8..58cdfe8b9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7606,7 +7606,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130 + image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -7615,102 +7615,94 @@ dsv4-fp4-gb200-dynamo-vllm: multinode: true disagg: true seq-len-configs: - # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's - # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg - # at this seq-len yet (PR #67 only publishes 8k/1k). - - isl: 1024 + - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch - # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - - conc-list: [1, 4, 8, 16, 32, 64] + # Six 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm + # aflowers/gb200-dsv4-recipes branch, recipes/vllm/deepseek-v4-pro-sa/ + # (the SemiAnalysis-curated subset of PR #77). conc-list values match + # each recipe's benchmark.concurrencies. + + # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). + # 18 nodes. Multiple TP-only decoders parallelize independent requests. + - conc-list: [1, 8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml" decode: - num-worker: 1 + num-worker: 8 tp: 8 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - - conc-list: [128, 256, 1024, 2048, 4096] + # 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes. + - conc-list: [64, 128, 256, 512, 1024] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # The 4096 overlap with the 1p1d block gives a crossover point. 8192 - # would saturate 1p1d's prefill, so this topology takes over there. - - conc-list: [4096, 8192] + # 1p4d pure-TP decode: 1 prefill (DEP=8) + 4 decode (TP=8). 10 nodes. + - conc-list: [256, 512] prefill: - num-worker: 3 + num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # 2p1d DEP-8 decode (c4096): 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. 
+ - conc-list: [4096] prefill: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] + ep: 8 + dp-attn: true + # 3p1d DEP-8 decode (c4096): 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. + - conc-list: [4096] prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [4096, 8192] + # 3p1d wide DEP-16 decode (c4096): 3 prefill (DEP=8) + 1 decode (DEP=16). 10 nodes. + - conc-list: [4096] prefill: - num-worker: 7 + num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml deleted file mode 100644 index bf5b441b9..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" - -# 1k/1k mid-to-high throughput topology. Extrapolated from -# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's -# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles -# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high -# enough at this prompt length; see kimi precedent). -# -# Differences from our 8k1k 7p1d-dep8-dep16: -# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes -# * max-model-len: 3072 instead of auto -# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq) -# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism) -# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so -# a slow first-time Lustre load + cudagraph capture can't get cut off by the -# SLURM wall clock. -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from -# Lustre with multiple workers contending for the same OSTs — previous 1k/1k -# run hit the default 1800s. Make this *very* generous since the cost of an -# over-long deadline is just sitting idle, not wasted compute. 
-health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x1024x2048x4096" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index 63e9e280c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those -# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) -# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s -# exceeds what one DP=8 worker can sustain. -# -# Decode capacity: -# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which -# leaves headroom over the conc=8192 working set (per-rank avg 512). -# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is -# ~512 so cudagraphs still apply at steady state. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 1024 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 1024 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml similarity index 58% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index 984c79526..ab6d27cb7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -1,24 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" -# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from -# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only -# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). 
No -# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet. +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml # -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64). +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes total. -sa +# variant extends concurrencies to 64x128x256x512x1024. # -# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see -# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list. - +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -28,7 +32,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -38,17 +41,13 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -57,30 +56,27 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -92,7 +88,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 3072 + max-model-len: 9280 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -103,42 +99,46 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. 
Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. + numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" - isl: 1024 + isl: 8192 osl: 1024 - concurrencies: "1x4x8x16x32x64" + concurrencies: "64x128x256x512x1024" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 000000000..3864fec47 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (pure TP=8). 10 nodes. +# Targets c256-c512 with TP-only decoders. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 000000000..b40f89d1c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 8 decode (pure TP=8). 18 nodes. +# Targets c8-c512 with parallel TP-only decoders. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + 
numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x8x16x32x64x128x256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml similarity index 54% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index 0c872e9c4..9848edb01 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -1,37 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" +name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" -# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch: -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml # -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TEP-style decode (TP-sharded -# attention + EP'd experts within one worker) gives the best per-user -# latency. +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. +# c4096-tuned variant (decode max-num-seqs=512). # # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# our launch script's SRT_SLURM_MODEL_PREFIX. -# * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026 -# which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM -# expert offload (offload-group-size/-num-in-group/-prefetch-step) is -# KEPT — it's load-bearing here, see the comment in vllm_config.prefill. -# * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode -# dropped. Both require PR #68 sa-bench tokenizer support that our -# pinned srtctl version doesn't have. The recipe-level -# `tokenizer-mode: deepseek_v4` for workers stays. 
-# * Container kept on the floating tag (`:deepseekv4-cu130`) instead of -# the upstream sha256 pin. -# * health_check / slurm.time_limit added — we observed cold-cache -# Lustre loads exceeding the default 1800s deadline. - +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -41,27 +32,22 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 4 decode_nodes: 2 - prefill_workers: 1 + prefill_workers: 2 decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,30 +56,27 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -105,7 +88,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 9280 + max-model-len: 16384 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -116,42 +99,46 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. 
+ numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1x4x8x16x32x64" + concurrencies: "4096" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml similarity index 50% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index d6b750bf2..3f3803d3b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -1,20 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +# +# Topology: 3 prefill (DEP=8) + 1 wide decode (DEP=16). 10 nodes. +# c4096-tuned variant. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. 
+# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -24,7 +32,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -34,15 +41,12 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 16 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -50,7 +54,15 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -58,7 +70,13 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -70,17 +88,23 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -90,7 +114,7 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto + max-model-len: 16384 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 @@ -102,11 +126,19 @@ backend: stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + tokenizer-mode: deepseek_v4 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024" + concurrencies: "4096" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 000000000..f3b09e0db --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +# +# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. +# c4096-tuned variant (decode max-num-seqs=512). +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b 
shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml deleted file mode 100644 index 6213373b3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ /dev/null @@ -1,122 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" - -# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra -# benchmark flag: use_chat_template=false. The HF tokenizer for -# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's -# --use-chat-template path calls tokenizer.apply_chat_template() and raises -# ValueError. Throughput benchmarking uses /v1/completions with random tokens -# anyway — no chat template needed. -# -# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a -# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ -# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and -# uses this native formatter — no custom Jinja template required. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads -# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor -# shards with 14 prefill workers contending for the same OSTs. The first -# bump to 7200s was still insufficient in one case, so pad generously to -# 14400s (4h). Over-long deadline only costs idle time, not compute. 
-health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8941211c1..b161e9b95 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1937,3 +1937,12 @@ - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/gb200-dsv4-recipes (PR #77, supersedes #71)" + - "8k/1k search-space expanded from 3 topologies to 6: adds 1p4d/1p8d pure-TP-decode (offload), 1p1d/2p1d/3p1d DEP-8 decode, and a 3p1d-dep8-dep16 wide-decode shape" + - "Drops local workarounds: numa-bind, benchmark.use_chat_template, and benchmark.custom_tokenizer are restored now that PR #77 ships vllm_numa_bind_hash_fix.py and sa-bench DSV4 tokenizer support" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 + diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..bbc9b22af 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,10 +43,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ 
$MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre - # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the - # model.path alias in our DSV4 recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 recipes. + export MODEL_PATH="/mnt/lustre01/models/deepseek-v4-pro" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" @@ -143,7 +141,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout aflowers/vllm-gb200-v0.20.0 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create