diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..de0db09d5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2461,6 +2461,61 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm. +# 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool). +# Cross-node TP=16 over IB: DSV4-Pro per-rank weight footprint at DP=16/EP=16 +# is 74.99 GiB on H100 80GB (run 24923521075 OOM'd in sparse_attn_indexer +# profile_run with only ~4 GiB headroom). TP=16 shards the model 16-way +# across the 2 nodes, dropping per-rank weights to ~5 GiB. Recipe bundled +# locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed. +dsv4-fp8-h100-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h100-multinode + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + # 1P+1D TEP=16 across the full 4-node h100-multinode pool. + # Each prefill/decode worker spans 2 nodes via Dynamo's --headless + # secondary-node mode + vLLM's MultiprocExecutor + torch.distributed PG. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..049fa8218 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,183 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) +# +# DSV4-Pro FP8 weights are 74.99 GiB per DP rank — DP=16/EP=16 (one process +# per GPU) leaves only ~4 GiB headroom on H100 80GB and sparse_attn_indexer +# OOMs during profile_run (run 24923521075 hit this on every prefill and +# decode worker simultaneously). Switching to cross-node TP=16 shards +# dense+expert weights 16-way → ~5 GiB/GPU, plenty of headroom. 
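A back-of-envelope check of the sharding figures quoted above; the 74.99 GiB footprint and TP=16 are the recipe's own numbers, and the snippet is standalone, not part of any recipe or launcher:

  # Quick sanity check of the per-rank weight math.
  awk 'BEGIN {
    full_fp8_weights_gib = 74.99   # per-DP-rank footprint quoted above
    tp                   = 16      # cross-node tensor-parallel degree
    printf "per-rank weight shard at TP=16: %.2f GiB\n", full_fp8_weights_gib / tp
  }'
  # -> per-rank weight shard at TP=16: 4.69 GiB (the "~5 GiB" figure above)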
+# +# Layout: +# - 1 prefill endpoint × TP=16 across 2 nodes (16 GPUs) +# - 1 decode endpoint × TP=16 across 2 nodes (16 GPUs) +# - Total: 32 GPUs across 4 nodes (fills h100-multinode pool) +# +# How the cross-node launch works: +# - srt-slurm's standard TP mode (no data-parallel-size) starts one srun +# per node, each process gets all 8 local GPUs. +# - Leader (node_rank=0): full Dynamo + vLLM with `--master-addr +# --nnodes 2 --node-rank 0 --tensor-parallel-size 16`. MultiprocExecutor +# spawns 8 local workers, then waits for the 8 remote workers. +# - Secondary (node_rank=1): adds `--headless` (srt-slurm vllm.py:386-388), +# which routes through dynamo's run_dynamo_headless → vLLM's run_headless → +# MultiprocExecutor(monitor_workers=False) joining the leader's PG over +# torch.distributed (master_addr / master_port). NCCL backs the TP +# all-reduce; on h100-multinode that flows over IB (no NVLink between +# nodes), so per-layer TP comms is the dominant latency cost — accepted +# as the only way to fit DSV4-Pro on 80 GB H100. + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel AND that exposes --headless via run_dynamo_headless (see +# components/src/dynamo/vllm/main.py:80). Requires the alec-flowers/srt-slurm +# fork (NVIDIA/srt-slurm#71), which extends the dynamo config to accept +# `hash` as well as `version`. +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +# Bumped from the srt-slurm 6h default. Cross-node TP=16 over IB plus +# cudagraph capture extends post-load init noticeably; cold-cache Lustre +# weight load alone took 24 min in run 24923521075. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24923521075 +# observed full weight load taking ~1450s with three concurrent matrix +# jobs starving the same Lustre OSTs. Cross-node TP setup + indexer +# warm-up adds more on top. Match the GB200 dsv4 recipes; the cost of an +# over-long deadline is sitting idle, not wasted compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL + # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM + # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp + # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is + # the companion flag. + # + # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True reduces fragmentation + # in the tight headroom — even with TP=16 sharding the model 16-way, a + # contiguous 512 MiB indexer scratch can still fail under fragmented + # 80 GB caches. The OOM error message itself recommends this exact flag. 
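The leader/secondary flow described in the header comment above, sketched as the two per-node command lines. The real template lives in srt-slurm's vllm.py; the "python -m dynamo.vllm" entrypoint spelling and the hostname below are assumptions, and only the flags themselves come from that description:

  MASTER_ADDR="prefill-node-0"   # placeholder leader hostname

  # node_rank=0 (leader): full Dynamo + vLLM. MultiprocExecutor spawns the
  # 8 local TP workers, then waits for the 8 remote ranks to join the PG.
  python -m dynamo.vllm \
    --master-addr "$MASTER_ADDR" --nnodes 2 --node-rank 0 \
    --tensor-parallel-size 16

  # node_rank=1 (secondary): --headless routes through run_dynamo_headless ->
  # vLLM's run_headless -> MultiprocExecutor(monitor_workers=False), joining
  # the leader's torch.distributed process group over master_addr/master_port.
  python -m dynamo.vllm \
    --master-addr "$MASTER_ADDR" --nnodes 2 --node-rank 1 \
    --tensor-parallel-size 16 --headless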
+ prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + # tensor-parallel-size: 16 across 2 nodes (each srt-slurm srun process + # gets 8 local GPUs; vLLM forms a 16-rank PG via the master_addr + # handshake). data-parallel-size is intentionally absent — its presence + # would push srt-slurm into the per-GPU process layout, which is what + # broke the previous TEP=16 attempt with "World size (16) > available + # GPUs (1) in this node". + # + # enable-expert-parallel keeps experts sharded EP=16 along the same + # 16 ranks (each rank holds 1/16 of the routed experts). Communication + # is dominated by per-layer TP all-reduce + EP all-to-all, both over + # IB. + # + # enforce-eager on both sides for the first cross-node attempt: cudagraph + # capture across nodes is fragile (FULL_DECODE_ONLY graphs include the + # cross-node TP all-reduce) and the previous run's 1.48 GiB private-pool + # accumulation already burned the headroom. Drop graphs to ship; revisit + # for decode performance once the server is observed healthy. + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + # With TP=16 the per-rank model footprint drops from ~75 GiB to + # ~5 GiB, so we can match the H200 single-node 0.95 utilization. + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The + # server uses --tokenizer-mode deepseek_v4 to handle templating itself, + # but sa-bench's local apply_chat_template path raises ValueError. + # Send raw prompts; the server handles formatting. 
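Rough per-GPU budget implied by the gpu-memory-utilization comment above, using the recipe's own ~5 GiB shard estimate, the 0.95 setting, and an assumed ~80 GiB of visible HBM (illustrative arithmetic, not a measured number):

  awk 'BEGIN {
    hbm_gib   = 80     # assumed visible H100 HBM, rough
    util      = 0.95   # gpu-memory-utilization above
    shard_gib = 5      # approximate per-rank weight shard at TP=16
    printf "vLLM budget: %.1f GiB; left for KV cache + activations: ~%.0f GiB\n", hbm_gib * util, hbm_gib * util - shard_gib
  }'
  # -> vLLM budget: 76.0 GiB; left for KV cache + activations: ~71 GiB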
+ use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..e31e8d531 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,120 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) — 8k/1k +# +# Same engine flags as the 1k1k variant. Only the benchmark block differs +# (ISL=8192, tighter concurrency sweep due to larger prefill work). +# +# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +# for the full rationale (cross-node TP=16 layout, MultiprocExecutor + +# torch.distributed PG handshake, --headless secondary node, IB-backed +# NCCL TP all-reduce, deepseek_v4 parsers, NixlConnector). + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 
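Why 8k prompts make each request heavier than in the 1k1k sweep, hence the shorter concurrency list that follows; this assumes vLLM chunks prefill at the max-num-batched-tokens budget of 512 configured above (illustrative arithmetic only):

  awk 'BEGIN { isl = 8192; budget = 512; printf "prefill scheduler steps per request: >= %d\n", int((isl + budget - 1) / budget) }'
  # -> prefill scheduler steps per request: >= 16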
+ concurrencies: "4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 397da6591..e26c5b4f7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,3 +1819,13 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + +- config-keys: + - dsv4-fp8-h100-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP8 H100 multinode disagg benchmark via dynamo-vllm" + - "2 prefill nodes + 2 decode nodes (32 H100s total, TP16/EP16 per side)" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Engine flags match H200 single-node recipe (deepseek_v4 tokenizer/parsers, FP8 KV cache, block size 256, prefix caching disabled)" + - "max-model-len 16384 (H100 80GB KV headroom; H200's 800k does not fit across 2 decode nodes)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1142 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..26eabad89 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -29,8 +29,29 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-trt: $MODEL_PREFIX/$PRECISION" exit 1 fi + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" + export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + # NVIDIA/srt-slurm@sa-submission-q2-2026 installs ai-dynamo 1.0.1, + # which imports vllm.inputs.data.TokensPrompt — a path the DSV4 + # vLLM wheel has removed. Switch to alec-flowers' fork (head of + # https://github.com/NVIDIA/srt-slurm/pull/71) which supports + # dynamo.hash pinning so the recipe can pick a dynamo commit + # compatible with the DSV4 vllm.inputs layout. Matches PR #1129 + # on GB200. + export SRT_SLURM_REPO_URL="https://github.com/alec-flowers/srt-slurm.git" + export SRT_SLURM_REF="d60e3f1c7921721e52af01afaab59a70a1631106" + else + echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" + exit 1 + fi + # Verify the weights are staged and log their size (catches partial + # downloads / wrong revisions before we burn 8 min on weight load). + du -sh "$MODEL_PATH" 2>/dev/null || echo "WARNING: could not stat $MODEL_PATH" else - echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -41,9 +62,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone "${SRT_SLURM_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout "${SRT_SLURM_REF:-sa-submission-q2-2026}" + + # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until + # the upstream PR lands; cp -r merges directories on GNU cp.
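What the recipe overlay in the next block is expected to produce, tying the bundled recipe back to the CONFIG_FILE path that nvidia-master.yaml passes in; the check below is a standalone illustration run from inside the srt-slurm clone, not part of the launcher:

  # After `cp -r "$LOCAL_RECIPES_DIR"/* recipes/`, the in-repo file
  #   benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
  # should appear at the path the config references:
  test -f recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml \
    && echo "overlay OK" \
    || echo "overlay missing the 1k1k dsv4 recipe"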
+ LOCAL_RECIPES_DIR="$GITHUB_WORKSPACE/benchmarks/multi_node/srt_slurm_recipes" + if [ -d "$LOCAL_RECIPES_DIR" ]; then + echo "Overlaying local srt-slurm recipes from $LOCAL_RECIPES_DIR" + cp -r "$LOCAL_RECIPES_DIR"/* recipes/ + fi echo "Installing srtctl..." export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" @@ -78,6 +107,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + # vLLM container mapping - IMAGE is a Docker Hub reference (no registry prefix swap) + CONTAINER_KEY="$IMAGE" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" fi export ISL="$ISL" @@ -105,6 +138,7 @@ model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}" @@ -156,11 +190,21 @@ EOF LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + # Defensive: pre-create the logs subdir so Slurm's #SBATCH --output=... + # /%j/logs/sweep_%j.log can open the target file even on NFS mounts + # where the compute-node Slurm stepd lacks permission to mkdir -p. + mkdir -p "$LOGS_DIR" 2>/dev/null || true + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + scontrol show job "$JOB_ID" | tee "outputs/$JOB_ID/scontrol_show_job.txt" 2>/dev/null + # Preserve sbatch_script.sh, config.yaml, metadata, and any partial + # log so the failure can be diagnosed from the CI artifact. + if [ -d "outputs/$JOB_ID" ]; then + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "outputs/$JOB_ID" . + fi exit 1 fi echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
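How the dynamo-vllm container mapping above resolves the recipe's image to a concrete squash file, using the same sed expression as the launcher; the result is the path that the generated srt-slurm config points the dynamo-vllm container key at:

  IMAGE="vllm/vllm-openai:deepseekv4-cu129"
  SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh"
  echo "$SQUASH_FILE"
  # -> /mnt/nfs/sa-shared/containers/vllm+vllm-openai+deepseekv4-cu129.sqsh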