diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..74e9eb914 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7714,3 +7714,96 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + # Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA + # 228febcfe9c76347cd619a7622af83ca52ca35a4. 8k/1k only — PR 84 + # publishes 5 recipes spanning low-conc (TP=4 decode) → mid (DP=4/8 + # decode + DP=4 prefill workers) → max (14p1d-dep4-dep16, 18 nodes). + # Each recipe rack-pins via its own sbatch_directives.segment. + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # Low-conc / interactivity: 1 prefill (DP=4 + EP) + 1 decode (TP=4). + # 2 nodes total. Decode is plain TP, no EP/DP. + - conc-list: [4, 8, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Mid-low: 1 prefill (DP=4) + 1 decode (DP=4 + EP). 2 nodes total. + # Decode swings to DP+EP at conc 256/512 to spread the MoE experts. + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + # Mid-high: 6 prefills (DP=4 each) + 1 decode (DP=8 + EP). 10 nodes + # per upstream resources block (decode_nodes:4 verbatim from PR 84). + - conc-list: [1024, 2048] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # High: 12 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 16 nodes. + - conc-list: [3072, 4096] + prefill: + num-worker: 12 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max: 14 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 18 nodes + # — fills exactly one cw rack. 
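+      # GPU math: 14 prefill workers x 4 GPUs + one DP=16 decode worker +      # = 72 GPUs, i.e. 18 nodes at the cluster's 4 GPUs/node.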
+ - conc-list: [6144, 8192] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml new file mode 100644 index 000000000..4e392d943 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-12p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. High 8k/1k: +# 12 prefills (DP=4 each) + 1 wide decode (DP=16). 16 nodes total. +# Fits within one cw rack (18 nodes). + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "16" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 12 + decode_nodes: 4 + prefill_workers: 12 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + 
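+  # Decode sizing: DP=16 ranks x max-num-seqs=512 = 8192 concurrent +  # decode slots — headroom over the c3072/c4096 sweep this recipe runs.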
decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml new file mode 100644 index 000000000..964730f79 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-14p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Max 8k/1k: +# 14 prefills (DP=4 each) + 1 wide decode (DP=16). 18 nodes total — +# fills exactly one cw rack. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "18" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 14 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + 
no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6144x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml new file mode 100644 index 000000000..3b30212ad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-dep4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (DP=4 on 1 node). 2 nodes total. +# Decode shifts from TP=4 (low conc) to DP=4+EP at conc 256/512 to spread +# the MoE experts across more ranks.
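+# Sizing note: decode runs max-num-seqs=128 per DP rank, so DP=4 gives +# 4 x 128 = 512 concurrent decode slots — exactly the c512 ceiling this +# recipe sweeps.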
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml new file mode 100644 index 000000000..bd5f303ba --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-tp4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Low-concurrency 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (TP=4 on 1 node). 2 nodes total. +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks); pinned to one rack +# via sbatch_directives.segment because cw's srtslurm.yaml turns off +# srtctl's auto-segment. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + # Use full node memory; cw default cgroup is too tight for DSV4 weight load. + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + 
enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64x128x256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml new file mode 100644 index 000000000..b3e9cb523 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid-high 8k/1k: +# 6 prefills (DP=4 each, 1 node each) + 1 wide decode (DP=8). 10 nodes +# total per upstream resources block (decode_nodes:4 even though one +# DP=8 worker only needs 2 nodes — preserved verbatim from upstream). + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "10" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + 
data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0421c5596..2bf844101 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1928,3 +1928,13 @@ - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB300 sweep on cluster gb300-cw (CoreWeave; 2x 18-node racks)" + - "Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA 228febcf. 5 recipes spanning 8k/1k from c=4 to c=8192: 1p1d-dep4-tp4 (low conc), 1p1d-dep4-dep4 (c512), 6p1d-dep4-dep8 (c2048), 12p1d-dep4-dep16 (c4096), 14p1d-dep4-dep16 (c8192, 18 nodes)" + - "Container pinned to vllm/vllm-openai@sha256:d29a90b1... (cu130 + DSV4). Dynamo via published v1.0.2 wheel (install: true). Per-worker tuning: numa-bind, safetensors-load-strategy: prefetch, weight offload (group-size 3), enable-ep-weight-filter, enable-sleep-mode, all2all-backend: flashinfer_nvlink_one_sided on decode, PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True on prefill" + - "vLLM patches (auto-applied by upstream configs/vllm-container-deps.sh): cumem expandable_segments fix, MegaMoE free-orig (vllm-project/vllm#40860 backport), nvlink one-sided bf16 fix, numa-bind hash fix" + - "New runners group gb300-cw (gb300-cw_0 through gb300-cw_3) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each recipe rack-pins via sbatch_directives.segment (cw's srtslurm.yaml turns off srtctl auto-segment)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..569cc28ac --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,305 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cw's filesystem +# layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, +# and the SLURM partition is `all`. cw is 2x 18-node racks; srtctl's +# auto-segment is disabled (use_segment_sbatch_directive: false) and each +# recipe pins its own segment via sbatch_directives — the largest +# topology (14p1d-dep4-dep16, 18 nodes) fills exactly one rack. +# +# srt-slurm is checked out at NVIDIA/srt-slurm PR #84 head; that PR ships +# the dynamo 1.0.2 install path + the vLLM patches the new recipes +# require, so we use upstream's configs/vllm-container-deps.sh and +# configs/patches/* unchanged (no local overlay).
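+# +# Illustrative invocation (these env vars are normally injected by the +# CI runner from .github/configs/nvidia-master.yaml): +#   FRAMEWORK=dynamo-vllm MODEL_PREFIX=dsv4 PRECISION=fp4 ISL=8192 OSL=1024 \ +#   CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml \ +#   ./runners/launch_gb300-cw.sh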
+ +set -x + +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local NVMe on cw. + export MODEL_PATH="/mnt/vast/models/dsv4/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; no separate batch queue. +# Account `cw-sup` is what `sacctmgr show assoc user=$USER` returns on this +# cluster — `benchmark` (inherited from gb200-nv) does not exist here. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env to +# decide which host driver libraries (libcuda.so.1, libnvidia-*.so) to +# mount into the container. cw doesn't set them by default — without them +# the container has no libcuda and `import vllm._C` dies with +# "libcuda.so.1: cannot open shared object file". SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# The deepseekv4-cu130 vLLM image is pre-staged at /mnt/vast/squash_dupe/ +# (manual upload — enroot import of the ~25 GB image takes too long to +# repeat each run). nginx is small enough to import on-demand into +# /mnt/vast/squash/. +SQUASH_DIR="/mnt/vast/squash" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="/mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +if [ ! -f "$SQUASH_FILE" ]; then + echo "ERROR: pre-staged vLLM squash not found at $SQUASH_FILE" >&2 + echo "Re-stage it from docker://$IMAGE or repoint SQUASH_FILE." >&2 + exit 1 +fi +enroot import -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# The srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +# Pin to NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) head SHA. PR 84 +# carries the configs/patches/* (cumem expandable_segments fix, MegaMoE +# free_orig, nvlink one-sided bf16 fix, numa-bind hash fix) and the +# matching configs/vllm-container-deps.sh that wires them up. Released +# dynamo 1.0.2 wheel + sleep-mode + safetensors prefetch make the +# prebuild infrastructure unnecessary, so we use upstream's setup +# script directly — no overlay.
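+# (The fetch below materializes the PR head as local branch pr-84; the +# checkout then detaches at the pinned SHA, so the run builds the pinned +# commit rather than whatever the PR head currently points at.)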
git fetch origin pull/84/head:pr-84 +git checkout 228febcfe9c76347cd619a7622af83ca52ca35a4 +# Use `cp -rT` so if the upstream branch ever ships a stub +# `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it +# rather than nesting (`cp -r src dst` would create +# `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). +mkdir -p recipes/vllm/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared +# NFS across both. srtctl's slurm template (job_script_minimal.j2) does +# `if ! command -v uv` and skips its own ARM64 install when uv is already +# on PATH; on compute nodes $HOME/.local/bin is on PATH by default, so a +# stray x86 binary at $HOME/.local/bin/uv from this runner shadows the +# template's install and crashes the orchestrator with +# `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +# Sanity: confirm the install landed where we expect, not in $HOME/.local/bin. +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +# Create srtslurm.yaml for srtctl. Keys and the `srtctl apply` invocation +# below follow the gb200-nv launcher this script mirrors; the cw-specific +# values are the `all` partition, the cw-sup account, the /mnt/vast +# staging paths, and use_segment_sbatch_directive: false (each recipe +# rack-pins itself via sbatch_directives.segment). +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml <<EOF +srt_slurm_root: "${SRTCTL_ROOT}" +partition: "${SLURM_PARTITION}" +account: "${SLURM_ACCOUNT}" +model_path: "${MODEL_PATH}" +squash_path: "${SQUASH_DIR}" +use_segment_sbatch_directive: false +EOF + +echo "Applying recipe config: $CONFIG_FILE" +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +# Wait for log file to appear (also check job is still alive) +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +# Poll for job completion in background +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +# Stream the log file until job completes (-F follows by name, polls instead of inotify for NFS) +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..."
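+# Layout the collection below expects under outputs/$JOB_ID/logs/ (per +# srtctl's sweep output; names illustrative): +#   sweep_${JOB_ID}.log — orchestrator log (tailed above) +#   <name>_isl8192_osl1024/results_concurrency_*.json — per-concurrency results +#   eval_results/ — present only when evals ran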
+ +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi + + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi