Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
8242762
gb300 1k1k sglang
Oseltamivir Apr 26, 2026
ba062c0
route gb300 sglang to cw cluster
Oseltamivir Apr 26, 2026
4f7d3bc
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
c21afd3
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7903970
connector
Oseltamivir Apr 26, 2026
26943f7
path
Oseltamivir Apr 26, 2026
e7b58f7
drop forced dynamo 0.8.1 install — use container-bundled dynamo for D…
Oseltamivir Apr 26, 2026
74d8307
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7f38f8c
Merge remote-tracking branch 'origin/main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
fa52ab0
match upstream PR #75 tunings + skip srtctl dynamo install
Oseltamivir Apr 26, 2026
bc80a16
add flags
hnyls2002 Apr 26, 2026
7f43185
add more selection space
hnyls2002 Apr 26, 2026
afca046
use _arm64 image tag + squash_dupe dir for gb300-cw
Oseltamivir Apr 27, 2026
3882a55
pin dynamo to 1.2.0.dev20260426 — first arm64 wheel with DSv4 formatter
Oseltamivir Apr 27, 2026
77bbcb8
step back to dynamo dev20260425 — earlier wheel may align with contai…
Oseltamivir Apr 27, 2026
d7dc646
prebuild dynamo wheel from hash 6a159fed on /mnt/vast — mirror PR #11…
Oseltamivir Apr 27, 2026
56b64e8
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
5e3340c
switch disagg transport nixl → mooncake
Oseltamivir Apr 27, 2026
83867ea
strip return_routed_experts kwarg from dynamo call sites — sglang 0.5…
Oseltamivir Apr 27, 2026
3efc208
fix dynamo regex: only match whole-line kwarg passes, leave assignmen…
Oseltamivir Apr 27, 2026
9a4018c
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7714,3 +7714,63 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-gb300-dynamo-sglang:
  # _arm64 variant: GH runner pod doing `enroot import` is amd64, but
  # gb300-cw compute nodes are aarch64 (Grace). Without the explicit
  # arm64 tag the registry serves the amd64 manifest, which fails to
  # exec on the compute side.
  image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300-cw
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  # 1P + 1D on a single GB300 (4 GPUs / node), MXFP4 MoE kernels,
  # Mooncake KV transfer (the referenced recipes pin
  # disaggregation-transfer-backend: mooncake — NIXL hung the disagg
  # warmup on this stack; see the recipe headers). Recipes staged at
  # benchmarks/multi_node/srt-slurm-
  # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm
  # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave
  # (2x 18-node racks); recipes set their own sbatch_directives.segment
  # for rack pinning.
  #
  # Two search-space bands:
  #   * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give
  #     single-user latency floor; 4-128 covers the saturation curve
  #     mirroring NVIDIA/srt-slurm PR #75.
  #   * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn +
  #     DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head
  #     comparison (find the crossover where DPA beats TP-only); 256-
  #     1024 extends past the symmetric saturation point (~conc=128 /
  #     838 Total TPS/GPU per PR #75).
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # Band 1: symmetric — both sides plain TP=4.
        - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
          # NOTE(review): nesting reconstructed from a flattened diff —
          # confirm additional-settings sits at the band level, not
          # under prefill.
          additional-settings:
            - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml"
          decode:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
        # Band 2: asymmetric — decode side switches on DP-attn + EP=4.
        - conc-list: [16, 32, 64, 128, 256, 512, 1024]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
          additional-settings:
            - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml"
          decode:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,8 @@ gb300:
- 'gb300-nv_0'
- 'gb300-nv_1'
- 'gb300-nv_2'
# CoreWeave GB300 runner pool (cluster "gb300-cw"); selected via
# `runner: gb300-cw` in .github/configs/nvidia-master.yaml.
gb300-cw:
  - 'gb300-cw_0'
  - 'gb300-cw_1'
  - 'gb300-cw_2'
  - 'gb300-cw_3'
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"

# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology:
#   Prefill: 1 node, TP=4 (no DP-attn, no EP).
#   Decode:  1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit).
# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**.
#
# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream
# PR #75 exactly; this DEP4 variant is a local extension to probe whether
# decode-side DP-attn + DeepEP unlocks throughput past the symmetric
# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75).
#
# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks
# see DP=4 replication. SGLang's --disaggregation-decode-tp and
# --disaggregation-decode-dp flags on the prefill engine carry this
# metadata so KV chunks route to the correct decode rank during the
# transfer (server_args.py:643-654, validate_disagg_tp_size).
#
# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling.
# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container
# regardless of dynamo version (run 24973148979 with hash 6a159fed +
# prebuild cache still hit the same watchdog timeout). PR #75 calls
# out Mooncake as the working transport for state buffers.
#
# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same
# way as the symmetric sibling.

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
  precision: "fp4"

# Build dynamo from hash 6a159fed via prebuild cache. See the TP4
# sibling header for the full rationale and the casualty timeline —
# short version: arm64 container ships no ai-dynamo, dev wheels API-
# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror
# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall
# from /mnt/vast/dynamo_cache/<hash> per rank.
dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  # install: false → srtctl skips its own pip install; the setup_script
  # below is the sole installer (same as the TP4 sibling).
  install: false

setup_script: gb300-cw-sglang-container-deps.sh

# Expose the prebuilt-wheel cache to every worker container so the
# setup_script can install from it.
extra_mount:
  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# segment: rack pinning on cw (keeps the 1P/1D KV path rack-local);
# mem "0" = request all node memory.
sbatch_directives:
  segment: "2"
  mem: "0"

slurm:
  time_limit: "8:00:00"

# 1440 attempts x 10 s ≈ 4 h startup budget before giving up.
health_check:
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: sglang

prefill_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

decode_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
  # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn +
  # EP is enabled. Mirror gen_launch.py medium/large defaults.
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
  SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
  SGLANG_OPT_USE_FAST_MASK_EP: "1"
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake
    # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so
    # KV chunks route to the correct decode rank during the Mooncake
    # KV transfer.
    disaggregation-decode-tp: 4
    disaggregation-decode-dp: 4
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    # Full DP-attn on 4 GPUs: each rank is its own DP unit for
    # attention; MoE is sharded across EP (ep_size = tp_size = 4
    # implicit when --moe-a2a-backend deepep).
    enable-dp-attention: true
    dp-size: 4
    moe-a2a-backend: deepep
    deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    # Decode side sized 4x the prefill cap (512 vs 128) to hold the
    # high-conc band of the sweep below.
    max-running-requests: 512
    cuda-graph-max-bs: 512
    chunked-prefill-size: 8192
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison
  # (where does decode-side DPA start beating TP-only?); 256-1024
  # probes throughput past the symmetric saturation point.
  concurrencies: "16x32x64x128x256x512x1024"
  req_rate: "inf"
  use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
name: "dsv4-sglang-disagg-gb300-1p1d-tp4"

# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang +
# dynamo frontend. Ported from NVIDIA/srt-slurm PR #75
# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of
# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree.
#
# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at
# 1P1D-TP4 fits trivially within a single rack; the explicit segment
# below pins them so the KV transfer between prefill and decode stays
# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment
# so each recipe owns its own value.)
#
# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched
# below — see "Transport: Mooncake"). PR #75 measures saturation at
# conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
#
# Transport: Mooncake (not NIXL).
#   * NIXL hung the prefill startup warmup indefinitely on this stack
#     (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the
#     DSv4 formatter — compat shim warns on every worker, then a
#     4-token warmup probe never runs forward). See runs through
#     2026-04-27 ~02:35 (gh actions 24973148979) for the exact
#     watchdog trace.
#   * PR #75 explicitly notes "Mooncake handles state buffers
#     correctly" — the disagg accuracy bug it warns about is NIXL-
#     specific, and switching to Mooncake side-steps both that bug
#     and our warmup hang.
#   * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container
#     ships the Mooncake transport built-in; no extra deps needed.
#
# Local deltas vs upstream PR #75:
#   * benchmark.type = sa-bench (upstream also uses sa-bench in the
#     latest revision; matches).
#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring
#     the dynamo-vllm gb300 recipe convention. Upstream targets a
#     different cluster and doesn't need this.

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
  precision: "fp4"

# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling
# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace-
# blackwell_arm64 image lacks both a working ai-dynamo and the rust
# toolchain for an in-container build; pinning a published dev wheel
# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat
# shim warns then disagg startup warmup hangs). Same prebuild-cache
# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel
# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/<hash>,
# and the setup_script below force-reinstalls from cache per rank
# (~30 s, no per-rank rust build, no API drift).
dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  # install: false → srtctl skips its own pip install; setup_script is
  # the sole installer.
  install: false

setup_script: gb300-cw-sglang-container-deps.sh

# Mount /mnt/vast/dynamo_cache into every worker container so each
# rank can pip-install from the wheel that launch_gb300-cw.sh
# pre-built there.
extra_mount:
  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# Pin both nodes (1P + 1D) to the same rack on cw. Without this they
# can land on different racks and pay the cross-rack hop on every KV
# transfer.
sbatch_directives:
  segment: "2"
  # Use all node memory; cw default is too tight for the MXFP4 worker.
  mem: "0"

slurm:
  time_limit: "8:00:00"

# 1440 attempts x 10 s ≈ 4 h startup budget before giving up.
health_check:
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: sglang

prefill_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

# Symmetric TP4: decode env is identical to prefill (no MEGA-MoE /
# DeepEP block here — that belongs to the DEP4 sibling).
decode_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  # Low-latency band — TP4 1P1D saturates near conc=128 on GB300
  # (PR #75 verified: 838 Total TPS/GPU at conc=128). Conc 1/2 give
  # single-user latency floor reference; 4-128 covers the saturation
  # curve. For high-conc Pareto use the DEP variants.
  concurrencies: "1x2x4x8x16x32x64x128"
  req_rate: "inf"
  use_chat_template: false
Loading
Loading