Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
8242762
gb300 1k1k sglang
Oseltamivir Apr 26, 2026
ba062c0
route gb300 sglang to cw cluster
Oseltamivir Apr 26, 2026
4f7d3bc
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
c21afd3
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7903970
connector
Oseltamivir Apr 26, 2026
26943f7
path
Oseltamivir Apr 26, 2026
e7b58f7
drop forced dynamo 0.8.1 install — use container-bundled dynamo for D…
Oseltamivir Apr 26, 2026
74d8307
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7f38f8c
Merge remote-tracking branch 'origin/main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
fa52ab0
match upstream PR #75 tunings + skip srtctl dynamo install
Oseltamivir Apr 26, 2026
bc80a16
add flags
hnyls2002 Apr 26, 2026
7f43185
add more selection space
hnyls2002 Apr 26, 2026
afca046
use _arm64 image tag + squash_dupe dir for gb300-cw
Oseltamivir Apr 27, 2026
3882a55
pin dynamo to 1.2.0.dev20260426 — first arm64 wheel with DSv4 formatter
Oseltamivir Apr 27, 2026
77bbcb8
step back to dynamo dev20260425 — earlier wheel may align with contai…
Oseltamivir Apr 27, 2026
d7dc646
prebuild dynamo wheel from hash 6a159fed on /mnt/vast — mirror PR #11…
Oseltamivir Apr 27, 2026
56b64e8
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
5e3340c
switch disagg transport nixl → mooncake
Oseltamivir Apr 27, 2026
83867ea
strip return_routed_experts kwarg from dynamo call sites — sglang 0.5…
Oseltamivir Apr 27, 2026
3efc208
fix dynamo regex: only match whole-line kwarg passes, leave assignmen…
Oseltamivir Apr 27, 2026
9a4018c
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7714,3 +7714,63 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-gb300-dynamo-sglang:
  # _arm64 variant: GH runner pod doing `enroot import` is amd64, but
  # gb300-cw compute nodes are aarch64 (Grace). Without the explicit
  # arm64 tag the registry serves the amd64 manifest, which fails to
  # exec on the compute side.
  image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300-cw
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  # 1P + 1D on a single GB300 (4 GPUs / node), MXFP4 MoE kernels,
  # Mooncake KV transfer (the referenced recipes pin
  # disaggregation-transfer-backend: mooncake — NIXL hung the disagg
  # warmup on this stack; see the recipe headers). Recipes staged at
  # benchmarks/multi_node/srt-slurm-
  # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm
  # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave
  # (2x 18-node racks); recipes set their own sbatch_directives.segment
  # for rack pinning.
  #
  # Two search-space bands:
  #   * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give
  #     single-user latency floor; 4-128 covers the saturation curve
  #     mirroring NVIDIA/srt-slurm PR #75.
  #   * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn +
  #     DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head
  #     comparison (find the crossover where DPA beats TP-only); 256-
  #     1024 extends past the symmetric saturation point (~conc=128 /
  #     838 Total TPS/GPU per PR #75).
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # Band 1: symmetric — both sides plain TP=4.
        - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
          # NOTE(review): nesting reconstructed from a flattened diff —
          # confirm additional-settings sits at the band level, not
          # under prefill.
          additional-settings:
            - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml"
          decode:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
        # Band 2: asymmetric — decode side switches on DP-attn + EP=4.
        - conc-list: [16, 32, 64, 128, 256, 512, 1024]
          prefill:
            num-worker: 1
            tp: 4
            ep: 1
            dp-attn: false
          additional-settings:
            - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml"
          decode:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,8 @@ gb300:
- 'gb300-nv_0'
- 'gb300-nv_1'
- 'gb300-nv_2'
# CoreWeave GB300 runner pool (cluster "gb300-cw"); selected via
# `runner: gb300-cw` in .github/configs/nvidia-master.yaml.
gb300-cw:
  - 'gb300-cw_0'
  - 'gb300-cw_1'
  - 'gb300-cw_2'
  - 'gb300-cw_3'
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4"

# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology:
#   Prefill: 1 node, TP=4 (no DP-attn, no EP).
#   Decode:  1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit).
# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**.
#
# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream
# PR #75 exactly; this DEP4 variant is a local extension to probe whether
# decode-side DP-attn + DeepEP unlocks throughput past the symmetric
# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75).
#
# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks
# see DP=4 replication. SGLang's --disaggregation-decode-tp and
# --disaggregation-decode-dp flags on the prefill engine carry this
# metadata so KV chunks route to the correct decode rank during the
# transfer (server_args.py:643-654, validate_disagg_tp_size).
#
# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling.
# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container
# regardless of dynamo version (run 24973148979 with hash 6a159fed +
# prebuild cache still hit the same watchdog timeout). PR #75 calls
# out Mooncake as the working transport for state buffers.
#
# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same
# way as the symmetric sibling.

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
  precision: "fp4"

# Build dynamo from hash 6a159fed via prebuild cache. See the TP4
# sibling header for the full rationale and the casualty timeline —
# short version: arm64 container ships no ai-dynamo, dev wheels API-
# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror
# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall
# from /mnt/vast/dynamo_cache/<hash> per rank.
dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  # install: false → srtctl skips its own pip install; the setup_script
  # below is the sole installer (same as the TP4 sibling).
  install: false

setup_script: gb300-cw-sglang-container-deps.sh

# Expose the prebuilt-wheel cache to every worker container so the
# setup_script can install from it.
extra_mount:
  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# segment: rack pinning on cw (keeps the 1P/1D KV path rack-local);
# mem "0" = request all node memory.
sbatch_directives:
  segment: "2"
  mem: "0"

slurm:
  time_limit: "8:00:00"

# 1440 attempts x 10 s ≈ 4 h startup budget before giving up.
health_check:
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: sglang

prefill_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

decode_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
  # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn +
  # EP is enabled. Mirror gen_launch.py medium/large defaults.
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
  SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
  SGLANG_OPT_USE_FAST_MASK_EP: "1"
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake
    # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so
    # KV chunks route to the correct decode rank during the Mooncake
    # KV transfer.
    disaggregation-decode-tp: 4
    disaggregation-decode-dp: 4
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    # Full DP-attn on 4 GPUs: each rank is its own DP unit for
    # attention; MoE is sharded across EP (ep_size = tp_size = 4
    # implicit when --moe-a2a-backend deepep).
    enable-dp-attention: true
    dp-size: 4
    moe-a2a-backend: deepep
    deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    # Decode side sized 4x the prefill cap (512 vs 128) to hold the
    # high-conc band of the sweep below.
    max-running-requests: 512
    cuda-graph-max-bs: 512
    chunked-prefill-size: 8192
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison
  # (where does decode-side DPA start beating TP-only?); 256-1024
  # probes throughput past the symmetric saturation point.
  concurrencies: "16x32x64x128x256x512x1024"
  req_rate: "inf"
  use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
name: "dsv4-sglang-disagg-gb300-1p1d-tp4"

# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang +
# dynamo frontend. Ported from NVIDIA/srt-slurm PR #75
# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of
# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree.
#
# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at
# 1P1D-TP4 fits trivially within a single rack; the explicit segment
# below pins them so the KV transfer between prefill and decode stays
# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment
# so each recipe owns its own value.)
#
# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300
# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched
# below — see "Transport: Mooncake"). PR #75 measures saturation at
# conc=128 / 838 Total TPS/GPU; sweep capped accordingly.
#
# Transport: Mooncake (not NIXL).
#   * NIXL hung the prefill startup warmup indefinitely on this stack
#     (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the
#     DSv4 formatter — compat shim warns on every worker, then a
#     4-token warmup probe never runs forward). See runs through
#     2026-04-27 ~02:35 (gh actions 24973148979) for the exact
#     watchdog trace.
#   * PR #75 explicitly notes "Mooncake handles state buffers
#     correctly" — the disagg accuracy bug it warns about is NIXL-
#     specific, and switching to Mooncake side-steps both that bug
#     and our warmup hang.
#   * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container
#     ships the Mooncake transport built-in; no extra deps needed.
#
# Local deltas vs upstream PR #75:
#   * benchmark.type = sa-bench (upstream also uses sa-bench in the
#     latest revision; matches).
#   * sbatch_directives.segment + mem: rack-pinning for cw, mirroring
#     the dynamo-vllm gb300 recipe convention. Upstream targets a
#     different cluster and doesn't need this.

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64"
  precision: "fp4"

# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling
# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace-
# blackwell_arm64 image lacks both a working ai-dynamo and the rust
# toolchain for an in-container build; pinning a published dev wheel
# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat
# shim warns then disagg startup warmup hangs). Same prebuild-cache
# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel
# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/<hash>,
# and the setup_script below force-reinstalls from cache per rank
# (~30 s, no per-rank rust build, no API drift).
dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  # install: false → srtctl skips its own pip install; setup_script is
  # the sole installer.
  install: false

setup_script: gb300-cw-sglang-container-deps.sh

# Mount /mnt/vast/dynamo_cache into every worker container so each
# rank can pip-install from the wheel that launch_gb300-cw.sh
# pre-built there.
extra_mount:
  - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# Pin both nodes (1P + 1D) to the same rack on cw. Without this they
# can land on different racks and pay the cross-rack hop on every KV
# transfer.
sbatch_directives:
  segment: "2"
  # Use all node memory; cw default is too tight for the MXFP4 worker.
  mem: "0"

slurm:
  time_limit: "8:00:00"

# 1440 attempts x 10 s ≈ 4 h startup budget before giving up.
health_check:
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: sglang

prefill_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

# Symmetric TP4: decode env is identical to prefill (no MEGA-MoE /
# DeepEP block here — that belongs to the DEP4 sibling).
decode_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  # Low-latency band — TP4 1P1D saturates near conc=128 on GB300
  # (PR #75 verified: 838 Total TPS/GPU at conc=128). Conc 1/2 give
  # single-user latency floor reference; 4-128 covers the saturation
  # curve. For high-conc Pareto use the DEP variants.
  concurrencies: "1x2x4x8x16x32x64x128"
  req_rate: "inf"
  use_chat_template: false
Loading
Loading