Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
93db2e2
Day 0 DeepSeek V4 Pro FP4 GB200 disaggregated SGLang benchmarks
Oseltamivir Apr 25, 2026
1bc4c2e
Drop unsupported backend.connector field from sglang recipes
Oseltamivir Apr 25, 2026
c0d477d
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 25, 2026
65b8b17
Drop dynamo: version: 0.8.1 — incompatible with deepseek-v4-grace-bla…
Oseltamivir Apr 25, 2026
9d883ba
Add dynamo: install: false — srtctl default is install=True
Oseltamivir Apr 25, 2026
1b75dd7
Pin dynamo to v1.2.0-sglang-deepseek-v4-dev.1 tag (hash 21f135f5)
Oseltamivir Apr 25, 2026
eb3f62c
Force deepep-mode: low_latency to work around mxfp4+DeepEP normal-dis…
Oseltamivir Apr 25, 2026
6c608df
Drop DeepEP / DP-attn / EP — fork-only mxfp4_deepseek bug, both dispa…
Oseltamivir Apr 25, 2026
2bb3ef0
Add moe-dense-tp-size: 1 — fix shared-experts FP8 block-quant divisib…
Oseltamivir Apr 25, 2026
d34d894
Set SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 in all env bl…
Oseltamivir Apr 25, 2026
c24f25b
Switch to TP=4 single-node — match PR #75 verbatim, fix FP8 block-quant
Oseltamivir Apr 25, 2026
c0aec93
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 25, 2026
8316d3f
Restore mi355x retry changelog entries clobbered by merge
Oseltamivir Apr 25, 2026
f089567
Switch back to TP=8: enable-dp-attention + moe-dense-tp-size: 1, no m…
Oseltamivir Apr 26, 2026
34e4a92
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
5b6eb2f
Scope sweep to high-conc DeepEP only — temporarily comment 1p1d blocks
Oseltamivir Apr 26, 2026
b913586
tep fix + dep for high conc
Oseltamivir Apr 26, 2026
bca99eb
sike no dpa
Oseltamivir Apr 26, 2026
6c09973
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
5866658
Cap SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK at 1024 — sglang L…
Oseltamivir Apr 26, 2026
c0fc3bb
Revert 3p1d-dep8-dep16 to no-DeepEP TP-only; uncomment full 1k/1k + 8…
Oseltamivir Apr 26, 2026
0526fa0
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
30c2512
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
bc9fccf
Try moe-a2a-backend: flashinfer on 3p1d-dep8-dep16 for high-conc EP
Oseltamivir Apr 27, 2026
8ea8e77
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
e6d8943
Revert flashinfer EP attempt — accept TP-only pareto, every EP backen…
Oseltamivir Apr 27, 2026
90304df
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
1d27533
fix(sglang): bump 8k1k prefill max-running-requests from 4 to 8
Oseltamivir Apr 27, 2026
a172069
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7714,3 +7714,122 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-gb200-dynamo-sglang:
image: lmsysorg/sglang:deepseek-v4-grace-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
# 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no
# DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's
# sglang fork has a fork-only mxfp4_deepseek kernel that crashes any
# DeepEP forward path (both DeepEPLLDispatchOutput and
# DeepEPNormalDispatchOutput lack the `topk_output` field the kernel
# reads). At TP=8 the shared-experts gate_up_proj would also fail
# FP8 block-quant divisibility (1536/8=192, not divisible by 128)
# unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated
# — and that flag is gated on `enable_dp_attention=True` in sglang
# dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at
# its default `"none"` — sglang `forward_normal` path runs (verified
# in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is
# deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep
# the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4-
# gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with
# all-reduce/all-gather MoE dispatch.
- isl: 1024
osl: 1024
search-space:
# Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
- conc-list: [1, 4, 8, 16, 32, 64]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
# Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
- conc-list: [128, 256, 1024, 2048, 4096]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
# High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
# The 4096 point overlaps the 1p1d block above, giving a
# prefill-scaling A/B comparison at fixed concurrency.
- conc-list: [4096, 8192]
prefill:
num-worker: 3
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
- conc-list: [1, 4, 8, 16, 32, 64]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
# Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
- conc-list: [512, 1024]
prefill:
num-worker: 3
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
# Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes.
- conc-list: [4096, 8192]
prefill:
num-worker: 7
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16"

# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
#
# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes.
# Filename keeps the historical 'dep8-dep16' tag for symmetry with the
# vLLM sibling; the actual recipe is TP-only (no DeepEP) — see the
# nvidia-master.yaml comment for why.
# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank
# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing).

model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
precision: "fp4"

# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
dynamo:
hash: "21f135f5edf40e12e6ff5db2b462d862a6d6ab9b"
install: true

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: sglang

prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"

decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"

sglang_config:
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
moe-dense-tp-size: 1
enable-dp-attention: true
dp-size: 8
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 16
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
moe-dense-tp-size: 1
enable-dp-attention: true
dp-size: 8
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 512
cuda-graph-max-bs: 512
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "128x256x1024x2048x4096"
req_rate: "inf"
use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"

# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The
# closest references on NVIDIA/srt-slurm are:
# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) —
# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars.
# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) —
# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 +
# chunked-prefill-size=4096 + disable-flashinfer-autotune.
# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
# framework numbers stay directly comparable.
#
# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes.
# Targets very low concurrency (1-64).
#
# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"):
# 1. DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM.
# TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits.
# 2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork
# ships a fork-only quant kernel `mxfp4_deepseek.py` that reads
# `dispatch_output.topk_output`. Neither `DeepEPLLDispatchOutput`
# nor `DeepEPNormalDispatchOutput` exposes that field in this
# fork, so `forward_deepep` always crashes the prefill scheduler.
# We must stay off the DeepEP path.
# 3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant
# divisibility (1536/8=192, not divisible by block_n=128).
# `moe-dense-tp-size: 1` runs the dense MLP layers replicated
# (TP=1) so the divisibility check passes — but that flag is
# gated on `enable_dp_attention=True` in sglang
# `python/sglang/srt/layers/dp_attention.py`
# (`compute_dp_attention_local_info` returns the full `tp_size`
# and ignores `moe_dense_tp_size` when DP-attn is off).
# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so
# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set.
# The default `"none"` lands the MoE on `forward_normal` instead of
# `forward_deepep` — verified in deepseek_v2.py:
# `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori|
# is_ascend_fuseep|is_flashinfer` → False with default.

model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
precision: "fp4"

# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI
# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in
# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import
# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell
# image's sglang fork does not expose `sgl.Engine`, so they crash at
# import with `AttributeError: module 'sglang' has no attribute
# 'Engine'`. The DSV4-targeted tag adds `from __future__ import
# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the
# annotation lazy so the module imports cleanly.
dynamo:
hash: "21f135f5edf40e12e6ff5db2b462d862a6d6ab9b"
install: true

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: sglang

# Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline
# that's actually been run upstream) plus the disaggregation timeout
# triple — heartbeat 100k matches the DSR1 sglang disagg convention.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"

decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"

sglang_config:
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
moe-dense-tp-size: 1
enable-dp-attention: true
dp-size: 8
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 16
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
moe-dense-tp-size: 1
enable-dp-attention: true
dp-size: 8
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 64
cuda-graph-max-bs: 64
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1x4x8x16x32x64"
req_rate: "inf"
use_chat_template: false
Loading
Loading