From 8242762a59ca108ad03f10ac46ae3cd9139acc1c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 00:35:40 -0700 Subject: [PATCH 01/16] gb300 1k1k sglang --- .github/configs/nvidia-master.yaml | 33 +++++++ .../1k1k/disagg-gb300-1p1d-tp4.yaml | 99 +++++++++++++++++++ perf-changelog.yaml | 9 ++ runners/launch_gb300-nv.sh | 18 +++- 4 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..b58f27aee 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7666,3 +7666,36 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + image: lmsysorg/sglang:deepseek-v4-grace-blackwell + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single + # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe + # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ + # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-nv.sh. + # DEP/TEP variants are upstream follow-ups; mirror that and ship 1P1D + # only here. 
+ seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [1, 4, 16, 64, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml new file mode 100644 index 000000000..307298449 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -0,0 +1,99 @@ +name: "dsv4-sglang-disagg-gb300-1p1d-tp4" + +# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang + +# dynamo frontend. Ported from NVIDIA/srt-slurm PR #75 +# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of +# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree. +# +# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300 +# (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under +# moderate-to-high concurrency. +# +# Local deltas vs upstream PR #75: +# * benchmark.type = sa-bench (upstream uses "manual" because they pair +# with a separate sa-bench launcher; our sweep harness drives sa-bench +# in-recipe). +# * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the +# GB200 sglang sibling — same handshake-stability rationale. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x16x64x256" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ed3c16ff..1807d37d2 100644 --- 
a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1833,3 +1833,12 @@ - "Bump --chunked-prefill-size from 4096 to 8192" - "Retrigger dsv4-fp8-mi355x-sglang" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell" + - "Topology: 1P + 1D, both TP=4 on a single GB300; MXFP4 MoE kernels, NIXL KV transfer" + - "Recipe ported from NVIDIA/srt-slurm PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5f48ddcec..ba888c10a 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -18,8 +18,15 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp8" export MODEL_PATH=/raid/shared/models/deepseek-r1-0528 export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" +elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSv4 + # sglang recipes (benchmarks/multi_node/srt-slurm-recipes/sglang/ + # deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml). + export SERVED_MODEL_NAME="deepseek-v4-pro" + export MODEL_PATH=/raid/shared/models/deepseek-v4-pro + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4" exit 1 fi @@ -47,6 +54,15 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 +# Overlay our hand-rolled DSv4 sglang recipes on top of the upstream tree. 
+# NVIDIA/srt-slurm has no upstream sglang DSv4 disagg recipe for GB300 +# beyond PR #75's 1P1D-TP4 entry, so we ship the recipe locally and copy +# it in here. Mirrors the equivalent block in launch_gb200-nv.sh. +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 +fi + echo "Installing srtctl..." export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" curl -LsSf https://astral.sh/uv/install.sh | sh From ba062c0f94277b89d6a0cea1b4999f7944218103 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:54:21 -0700 Subject: [PATCH 02/16] route gb300 sglang to cw cluster --- .github/configs/nvidia-master.yaml | 9 +- .github/configs/runners.yaml | 5 + .../1k1k/disagg-gb300-1p1d-tp4.yaml | 16 ++ runners/launch_gb300-cw.sh | 267 ++++++++++++++++++ runners/launch_gb300-nv.sh | 18 +- 5 files changed, 294 insertions(+), 21 deletions(-) create mode 100755 runners/launch_gb300-cw.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b58f27aee..338db42b2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7671,7 +7671,7 @@ dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300 + runner: gb300-cw precision: fp4 framework: dynamo-sglang multinode: true @@ -7679,9 +7679,10 @@ dsv4-fp4-gb300-dynamo-sglang: # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ - # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-nv.sh. - # DEP/TEP variants are upstream follow-ups; mirror that and ship 1P1D - # only here. 
+ # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-cw.sh. + # Cluster gb300-cw is CoreWeave (2x 18-node racks); recipe sets its + # own sbatch_directives.segment for rack pinning. DEP/TEP variants + # are upstream follow-ups; mirror that and ship 1P1D only here. seq-len-configs: - isl: 1024 osl: 1024 diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 693bb4561..4ce8d2fcb 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -131,3 +131,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 307298449..68e96edeb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -5,6 +5,12 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4" # (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of # the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree. # +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at +# 1P1D-TP4 fits trivially within a single rack; the explicit segment +# below pins them so the NIXL KV transfer between prefill and decode +# stays rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto +# segment so each recipe owns its own value.) +# # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300 # (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under # moderate-to-high concurrency. @@ -15,6 +21,8 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4" # in-recipe). # * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the # GB200 sglang sibling — same handshake-stability rationale. 
+# * sbatch_directives.segment + mem: rack-pinning for cw, mirroring the +# dynamo-vllm gb300 recipe convention. model: path: "deepseek-v4-pro" @@ -24,6 +32,14 @@ model: dynamo: version: 0.8.1 +# Pin both nodes (1P + 1D) to the same rack on cw. Without this they +# can land on different racks and pay the cross-rack hop on every NIXL +# KV transfer. +sbatch_directives: + segment: "2" + # Use all node memory; cw default is too tight for the MXFP4 worker. + mem: "0" + slurm: time_limit: "8:00:00" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..582a2a342 --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,267 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). Compared to +# that script, the SGLang flow is simpler: no dynamo wheel prebuild and +# no vllm-container-deps.sh override, because the SGLang recipes pin +# `dynamo.version: 0.8.1` and srtctl pip-installs from PyPI per rank. + +set -x + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in + # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/. + export MODEL_PATH="/mnt/vast/models/dsv4/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. `benchmark` +# (inherited from gb200-nv) does not exist on cw. 
+export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +SQUASH_DIR="/mnt/vast/squash" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout sa-submission-q2-2026 + +# Overlay our hand-rolled DSv4 SGLang recipes. NVIDIA/srt-slurm has no +# upstream sglang DSv4 disagg recipe yet beyond PR #75's 1P1D-TP4 +# entry, so we ship the recipe locally and copy it in here. 
`cp -rT` +# overlays onto a possibly-existing upstream stub instead of nesting. +mkdir -p recipes/sglang/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is +# shared NFS across both. srtctl's slurm template (job_script_minimal.j2) +# does `if ! command -v uv` and skips its own ARM64 install when uv is +# already on PATH; on compute nodes $HOME/.local/bin is on PATH by +# default, so a stray x86 binary at $HOME/.local/bin/uv from this +# runner shadows the template's install and crashes the orchestrator +# with `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." 
+cat > srtslurm.yaml <&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." + +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + exit 1 + fi + + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + CONFIG_NAME=$(basename "$result_subdir") + + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index ba888c10a..5f48ddcec 100644 --- 
a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -18,15 +18,8 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp8" export MODEL_PATH=/raid/shared/models/deepseek-r1-0528 export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" -elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSv4 - # sglang recipes (benchmarks/multi_node/srt-slurm-recipes/sglang/ - # deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml). - export SERVED_MODEL_NAME="deepseek-v4-pro" - export MODEL_PATH=/raid/shared/models/deepseek-v4-pro - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" exit 1 fi @@ -54,15 +47,6 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 -# Overlay our hand-rolled DSv4 sglang recipes on top of the upstream tree. -# NVIDIA/srt-slurm has no upstream sglang DSv4 disagg recipe for GB300 -# beyond PR #75's 1P1D-TP4 entry, so we ship the recipe locally and copy -# it in here. Mirrors the equivalent block in launch_gb200-nv.sh. -if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then - mkdir -p recipes/sglang/deepseek-v4 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 -fi - echo "Installing srtctl..." 
export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" curl -LsSf https://astral.sh/uv/install.sh | sh From 79039709ac4d8964e2fe49b476f67218f4f4aa37 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 10:21:04 -0700 Subject: [PATCH 03/16] connector --- .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 68e96edeb..c563ef45a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -63,7 +63,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" From 26943f799e74e29f2ec1959b005e5072afaf4087 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 10:56:23 -0700 Subject: [PATCH 04/16] path --- runners/launch_gb300-cw.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 582a2a342..82cb8d35e 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -43,6 +43,14 @@ mkdir -p "$SQUASH_DIR" SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +# Some images were imported with '+' separators (enroot's default) rather +# than '_'. Check for the '+' variant and symlink so both names resolve. +SQUASH_FILE_PLUS="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" +if [ ! 
-f "$SQUASH_FILE" ] && [ -f "$SQUASH_FILE_PLUS" ]; then + ln -sf "$SQUASH_FILE_PLUS" "$SQUASH_FILE" + echo "[squash] symlinked $SQUASH_FILE -> $SQUASH_FILE_PLUS" +fi + enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE From e7b58f72bd4f4f5e37f4b18fbb47ec1f35120bf5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 13:41:20 -0700 Subject: [PATCH 05/16] =?UTF-8?q?drop=20forced=20dynamo=200.8.1=20install?= =?UTF-8?q?=20=E2=80=94=20use=20container-bundled=20dynamo=20for=20DSv4=20?= =?UTF-8?q?formatter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index c563ef45a..d268ec7b6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -29,8 +29,16 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# NB: no `dynamo:` block. Mirroring upstream PR #75: rely on whatever +# dynamo the lmsysorg/sglang:deepseek-v4-grace-blackwell container ships. +# That container's bundled dynamo has the native DeepSeekV4Formatter (added +# at hash 6a159fed, 2026-04-23 — see comment in the gb200 vllm sibling +# disagg-gb200-7p1d-dep8-dep16.yaml) which auto-detects DSv4 by model name +# and serves /v1/completions without needing chat_template in +# tokenizer_config.json. 
Forcing dynamo.version=0.8.1 made srtctl pip-install +# an older release on top of the container that *did* require chat_template, +# and the frontend then 404'd: PromptFormatter.from_mdc rejected the model +# at pipeline build time. Run #24963242956 was the casualty. # Pin both nodes (1P + 1D) to the same rack on cw. Without this they # can land on different racks and pay the cross-rack hop on every NIXL From fa52ab060f35edb2aef3aa8783127350f7c830d0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 14:44:01 -0700 Subject: [PATCH 06/16] match upstream PR #75 tunings + skip srtctl dynamo install --- .github/configs/nvidia-master.yaml | 2 +- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 88 ++++++++++++------- 2 files changed, 55 insertions(+), 35 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 56c23b5d4..6778a25d3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7710,7 +7710,7 @@ dsv4-fp4-gb300-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - - conc-list: [1, 4, 16, 64, 256] + - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index d268ec7b6..516a14169 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -12,33 +12,53 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4" # segment so each recipe owns its own value.) # # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300 -# (4 GPUs / node). KV transfer over NIXL. Targets steady decode TPOT under -# moderate-to-high concurrency. +# (4 GPUs / node). KV transfer over NIXL. 
PR #75 measures saturation +# at conc=128 / 838 Total TPS/GPU; sweep capped accordingly. +# +# ⚠️ NIXL state-buffer-transfer accuracy bug (upstream PR #75 body): +# the SGLang NIXL backend currently registers and transfers the KV cache +# correctly but DROPS the model's auxiliary state buffer (SWA / NSA / +# Mamba). On DSv4-Pro this collapses GSM8K from 1.000 (agg) to ~0.13 +# (disagg) while throughput numbers and KV byte hashes look healthy. +# Mooncake handles state buffers correctly; the NIXL fix mirrors that +# (~237 lines extending KVArgsRegisterInfo/TransferInfo/register_buffer_ +# to_engine + adding send_state in +# python/sglang/srt/disaggregation/nixl/conn.py). Until the upstream +# sglang fix lands, the patch must be picked up via the +# lmsysorg/sglang:deepseek-v4-grace-blackwell container build. If +# eval-only GSM8K runs come back near 0.13 with healthy throughput, +# that's the cause — not a tuning issue. # # Local deltas vs upstream PR #75: -# * benchmark.type = sa-bench (upstream uses "manual" because they pair -# with a separate sa-bench launcher; our sweep harness drives sa-bench -# in-recipe). -# * Disagg timeout triple + NCCL_MNNVL/CUMEM env vars copied from the -# GB200 sglang sibling — same handshake-stability rationale. -# * sbatch_directives.segment + mem: rack-pinning for cw, mirroring the -# dynamo-vllm gb300 recipe convention. +# * benchmark.type = sa-bench (upstream also uses sa-bench in the +# latest revision; matches). +# * sbatch_directives.segment + mem: rack-pinning for cw, mirroring +# the dynamo-vllm gb300 recipe convention. Upstream targets a +# different cluster and doesn't need this. model: path: "deepseek-v4-pro" container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# NB: no `dynamo:` block. Mirroring upstream PR #75: rely on whatever -# dynamo the lmsysorg/sglang:deepseek-v4-grace-blackwell container ships. 
-# That container's bundled dynamo has the native DeepSeekV4Formatter (added -# at hash 6a159fed, 2026-04-23 — see comment in the gb200 vllm sibling -# disagg-gb200-7p1d-dep8-dep16.yaml) which auto-detects DSv4 by model name -# and serves /v1/completions without needing chat_template in -# tokenizer_config.json. Forcing dynamo.version=0.8.1 made srtctl pip-install -# an older release on top of the container that *did* require chat_template, -# and the frontend then 404'd: PromptFormatter.from_mdc rejected the model -# at pipeline build time. Run #24963242956 was the casualty. +# Use the container-bundled dynamo. The lmsysorg/sglang:deepseek-v4- +# grace-blackwell image ships a dynamo build with the native Rust +# DeepSeekV4Formatter (added at hash 6a159fed, 2026-04-23 — see comment +# in the gb200 vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). That +# formatter auto-detects DSv4 by model name and serves /v1/completions +# without needing chat_template in tokenizer_config.json. +# +# `install: false` is critical here — without it, srtctl's schema +# default (install: True, version: "0.8.0", see srt-slurm/src/srtctl/ +# core/schema.py:697) pip-installs ai-dynamo==0.8.0 from PyPI on top +# of the container, which predates the DSv4 formatter. The frontend +# then 404s on every request: PromptFormatter.from_mdc rejects the +# model at pipeline build time with "chat_template field is required +# in the tokenizer_config.json file". Casualties: runs #24963242956 +# (had dynamo.version: 0.8.1 explicit) and the follow-up (no dynamo +# block, fell through to the 0.8.0 default). +dynamo: + install: false # Pin both nodes (1P + 1D) to the same rack on cw. 
Without this they # can land on different racks and pay the cross-rack hop on every NIXL @@ -73,22 +93,10 @@ backend: type: sglang prefill_environment: - PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" decode_environment: - PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" sglang_config: prefill: @@ -99,8 +107,12 @@ backend: disaggregation-mode: "prefill" disaggregation-transfer-backend: nixl moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -110,13 +122,21 @@ backend: disaggregation-mode: "decode" disaggregation-transfer-backend: nixl moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x4x16x64x256" + random_range_ratio: 0.8 + # Low-latency band only — TP4 1P1D saturates near conc=128 on GB300 + # (PR #75 verified: 838 Total TPS/GPU at conc=128). For high-conc + # Pareto use the DEP variants (not in this PR). 
+ concurrencies: "4x8x16x32x64x128" req_rate: "inf" use_chat_template: false From bc80a16b773522746d8621c662f1bdb9d6ef8f04 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sun, 26 Apr 2026 15:05:51 -0700 Subject: [PATCH 07/16] add flags --- .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 516a14169..934ee1d75 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -94,9 +94,19 @@ backend: prefill_environment: SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" decode_environment: SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" sglang_config: prefill: From 7f431858a2601fff0648a4f3b2a6a4679356290e Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sun, 26 Apr 2026 15:26:24 -0700 Subject: [PATCH 08/16] add more selection space --- .github/configs/nvidia-master.yaml | 38 ++++- .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 141 ++++++++++++++++++ .../1k1k/disagg-gb300-1p1d-tp4.yaml | 9 +- 3 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6778a25d3..3e62175d5 100644 --- a/.github/configs/nvidia-master.yaml +++ 
b/.github/configs/nvidia-master.yaml @@ -7699,18 +7699,27 @@ dsv4-fp4-gb300-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - # Ported from NVIDIA/srt-slurm PR #75 — 1P + 1D, both TP=4 on a single - # GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL KV transfer. Recipe - # staged at benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ - # 1k1k/ and overlaid into the srt-slurm checkout by launch_gb300-cw.sh. - # Cluster gb300-cw is CoreWeave (2x 18-node racks); recipe sets its - # own sbatch_directives.segment for rack pinning. DEP/TEP variants - # are upstream follow-ups; mirror that and ship 1P1D only here. + # 1P + 1D on a single GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL + # KV transfer. Recipes staged at benchmarks/multi_node/srt-slurm- + # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm + # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave + # (2x 18-node racks); recipes set their own sbatch_directives.segment + # for rack pinning. + # + # Two search-space bands: + # * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give + # single-user latency floor; 4-128 covers the saturation curve + # mirroring NVIDIA/srt-slurm PR #75. + # * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn + + # DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head + # comparison (find the crossover where DPA beats TP-only); 256- + # 1024 extends past the symmetric saturation point (~conc=128 / + # 838 Total TPS/GPU per PR #75). 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - conc-list: [4, 8, 16, 32, 64, 128] + - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 4 @@ -7723,3 +7732,16 @@ dsv4-fp4-gb300-dynamo-sglang: tp: 4 ep: 1 dp-attn: false + - conc-list: [16, 32, 64, 128, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml new file mode 100644 index 000000000..c79cebc4c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -0,0 +1,141 @@ +name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4" + +# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology: +# Prefill: 1 node, TP=4 (no DP-attn, no EP). +# Decode: 1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit). +# Both on a single GB300 (4 GPUs / node). KV transfer over NIXL. +# +# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream +# PR #75 exactly; this DEP4 variant is a local extension to probe whether +# decode-side DP-attn + DeepEP unlocks throughput past the symmetric +# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75). +# +# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks +# see DP=4 replication. SGLang's --disaggregation-decode-tp and +# --disaggregation-decode-dp flags on the prefill engine carry this +# metadata so KV chunks route to the correct decode rank during NIXL +# transfer (server_args.py:643-654, validate_disagg_tp_size). +# +# Same NIXL state-buffer-transfer caveat as the TP4 sibling - see +# disagg-gb300-1p1d-tp4.yaml header. The grace-blackwell image build +# carries the patch. 
+# +# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same +# way as the symmetric sibling. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# Use the container-bundled dynamo (skip srtctl pip install). Same +# rationale as the TP4 sibling - see its header for the casualty list. +dynamo: + install: false + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn + + # EP is enabled. Mirror gen_launch.py medium/large defaults. 
+ SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so + # KV chunks route to the correct decode rank during NIXL transfer. + disaggregation-decode-tp: 4 + disaggregation-decode-dp: 4 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + # Full DP-attn on 4 GPUs: each rank is its own DP unit for + # attention; MoE is sharded across EP (ep_size = tp_size = 4 + # implicit when --moe-a2a-backend deepep). + enable-dp-attention: true + dp-size: 4 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison + # (where does decode-side DPA start beating TP-only?); 256-1024 + # probes throughput past the symmetric saturation point. 
+ concurrencies: "16x32x64x128x256x512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 934ee1d75..86319edc0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -144,9 +144,10 @@ benchmark: isl: 1024 osl: 1024 random_range_ratio: 0.8 - # Low-latency band only — TP4 1P1D saturates near conc=128 on GB300 - # (PR #75 verified: 838 Total TPS/GPU at conc=128). For high-conc - # Pareto use the DEP variants (not in this PR). - concurrencies: "4x8x16x32x64x128" + # Low-latency band — TP4 1P1D saturates near conc=128 on GB300 + # (PR #75 verified: 838 Total TPS/GPU at conc=128). Conc 1/2 give + # single-user latency floor reference; 4-128 covers the saturation + # curve. For high-conc Pareto use the DEP variants. 
+ concurrencies: "1x2x4x8x16x32x64x128" req_rate: "inf" use_chat_template: false From afca046a207c10117b732e1856e330e802fbcfec Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 17:20:30 -0700 Subject: [PATCH 09/16] use _arm64 image tag + squash_dupe dir for gb300-cw --- .github/configs/nvidia-master.yaml | 6 +++++- .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 2 +- .../deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 2 +- perf-changelog.yaml | 2 +- runners/launch_gb300-cw.sh | 13 ++++--------- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3e62175d5..2f0e63f53 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7691,7 +7691,11 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-sglang: - image: lmsysorg/sglang:deepseek-v4-grace-blackwell + # _arm64 variant: GH runner pod doing `enroot import` is amd64, but + # gb300-cw compute nodes are aarch64 (Grace). Without the explicit + # arm64 tag the registry serves the amd64 manifest, which fails to + # exec on the compute side. 
+ image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml index c79cebc4c..1b95cd936 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -25,7 +25,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" # Use the container-bundled dynamo (skip srtctl pip install). Same diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 86319edc0..c35fe4ec0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -38,7 +38,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4" model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" # Use the container-bundled dynamo. 
The lmsysorg/sglang:deepseek-v4- diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fb50c6f28..b3855391b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1891,7 +1891,7 @@ - dsv4-fp4-gb300-dynamo-sglang description: - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)" - "Topology: 1P + 1D, both TP=4 on a single GB300; MXFP4 MoE kernels, NIXL KV transfer" - "Recipe ported from NVIDIA/srt-slurm PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 82cb8d35e..cfa9ac6f1 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -38,19 +38,14 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). -SQUASH_DIR="/mnt/vast/squash" +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. +SQUASH_DIR="/mnt/vast/squash_dupe" mkdir -p "$SQUASH_DIR" SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -# Some images were imported with '+' separators (enroot's default) rather -# than '_'. Check for the '+' variant and symlink so both names resolve. -SQUASH_FILE_PLUS="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" -if [ ! 
-f "$SQUASH_FILE" ] && [ -f "$SQUASH_FILE_PLUS" ]; then - ln -sf "$SQUASH_FILE_PLUS" "$SQUASH_FILE" - echo "[squash] symlinked $SQUASH_FILE -> $SQUASH_FILE_PLUS" -fi - enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE From 3882a553fe7f19ffc7f4c9d39c122065383b29f8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 17:32:14 -0700 Subject: [PATCH 10/16] =?UTF-8?q?pin=20dynamo=20to=201.2.0.dev20260426=20?= =?UTF-8?q?=E2=80=94=20first=20arm64=20wheel=20with=20DSv4=20formatter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 10 ++++-- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 35 ++++++++++--------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml index 1b95cd936..d94fd569b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -28,10 +28,14 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" -# Use the container-bundled dynamo (skip srtctl pip install). Same -# rationale as the TP4 sibling - see its header for the casualty list. +# Pin a dynamo dev wheel containing the DSv4 formatter (hash 6a159fed, +# 2026-04-23). See the TP4 sibling header for the full rationale and +# casualty list — the lmsysorg sglang arm64 container ships no +# ai-dynamo, so install: false dies with ModuleNotFoundError, and any +# stable version <=1.0.2 lacks the formatter and 404s. 
dynamo: - install: false + install: true + version: "1.2.0.dev20260426" sbatch_directives: segment: "2" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index c35fe4ec0..aa00c9f8a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -41,24 +41,27 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" -# Use the container-bundled dynamo. The lmsysorg/sglang:deepseek-v4- -# grace-blackwell image ships a dynamo build with the native Rust -# DeepSeekV4Formatter (added at hash 6a159fed, 2026-04-23 — see comment -# in the gb200 vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). That -# formatter auto-detects DSv4 by model name and serves /v1/completions -# without needing chat_template in tokenizer_config.json. +# Pin a dynamo dev wheel that contains the native Rust DeepSeekV4Formatter +# (added at hash 6a159fed on 2026-04-23 — see the comment in the gb200 +# vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). The 2026-04-26 dev wheel +# from pypi.nvidia.com is the first wheel post that hash with both +# ai-dynamo and ai-dynamo-runtime aarch64 builds. Without the formatter, +# the dynamo frontend rejects DSv4 at pipeline build time with +# "chat_template field is required in the tokenizer_config.json file" +# and 404s every request — that's what runs #24963242956 and the +# follow-up hit on stable 0.8.x. # -# `install: false` is critical here — without it, srtctl's schema -# default (install: True, version: "0.8.0", see srt-slurm/src/srtctl/ -# core/schema.py:697) pip-installs ai-dynamo==0.8.0 from PyPI on top -# of the container, which predates the DSv4 formatter. 
The frontend -# then 404s on every request: PromptFormatter.from_mdc rejects the -# model at pipeline build time with "chat_template field is required -# in the tokenizer_config.json file". Casualties: runs #24963242956 -# (had dynamo.version: 0.8.1 explicit) and the follow-up (no dynamo -# block, fell through to the 0.8.0 default). +# The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container does +# NOT bundle ai-dynamo, so install: false is wrong here (gives +# "ModuleNotFoundError: No module named 'dynamo'" the moment srtctl +# tries to launch python3 -m dynamo.sglang). The gb200 vllm sibling +# solves the same gap with hash + install: true + a setup_script that +# pulls a prebuilt wheel from /mnt/vast/dynamo_cache; we don't have that +# cache yet for SGLang, so we just let srtctl pip-install the dev wheel +# per rank from pypi.nvidia.com — same payload, slower per-rank install. dynamo: - install: false + install: true + version: "1.2.0.dev20260426" # Pin both nodes (1P + 1D) to the same rack on cw. 
Without this they # can land on different racks and pay the cross-rack hop on every NIXL From 77bbcb8a4f552e4ceed15c1b62244e5ccdddd6e9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 18:10:21 -0700 Subject: [PATCH 11/16] =?UTF-8?q?step=20back=20to=20dynamo=20dev20260425?= =?UTF-8?q?=20=E2=80=94=20earlier=20wheel=20may=20align=20with=20container?= =?UTF-8?q?'s=20bundled=20sglang?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 2 +- .../sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml index d94fd569b..6e82557ae 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -35,7 +35,7 @@ model: # stable version <=1.0.2 lacks the formatter and 404s. dynamo: install: true - version: "1.2.0.dev20260426" + version: "1.2.0.dev20260425" sbatch_directives: segment: "2" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index aa00c9f8a..68ea73080 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -61,7 +61,7 @@ model: # per rank from pypi.nvidia.com — same payload, slower per-rank install. dynamo: install: true - version: "1.2.0.dev20260426" + version: "1.2.0.dev20260425" # Pin both nodes (1P + 1D) to the same rack on cw. 
Without this they # can land on different racks and pay the cross-rack hop on every NIXL From d7dc646431b2e8138021985ddfefd9c1f4c3c3b8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 18:46:45 -0700 Subject: [PATCH 12/16] =?UTF-8?q?prebuild=20dynamo=20wheel=20from=20hash?= =?UTF-8?q?=206a159fed=20on=20/mnt/vast=20=E2=80=94=20mirror=20PR=20#1150?= =?UTF-8?q?=20vllm=20pattern=20for=20sglang?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 20 +++-- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 42 ++++----- runners/gb300-cw-sglang-container-deps.sh | 44 ++++++++++ runners/launch_gb300-cw.sh | 86 +++++++++++++++++++ 4 files changed, 165 insertions(+), 27 deletions(-) create mode 100755 runners/gb300-cw-sglang-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml index 6e82557ae..4f514d394 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -28,14 +28,20 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" -# Pin a dynamo dev wheel containing the DSv4 formatter (hash 6a159fed, -# 2026-04-23). See the TP4 sibling header for the full rationale and -# casualty list — the lmsysorg sglang arm64 container ships no -# ai-dynamo, so install: false dies with ModuleNotFoundError, and any -# stable version <=1.0.2 lacks the formatter and 404s. +# Build dynamo from hash 6a159fed via prebuild cache. 
See the TP4 +# sibling header for the full rationale and the casualty timeline — +# short version: arm64 container ships no ai-dynamo, dev wheels API- +# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror +# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall +# from /mnt/vast/dynamo_cache/ per rank. dynamo: - install: true - version: "1.2.0.dev20260425" + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" sbatch_directives: segment: "2" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 68ea73080..86b262cfc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -41,27 +41,29 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" precision: "fp4" -# Pin a dynamo dev wheel that contains the native Rust DeepSeekV4Formatter -# (added at hash 6a159fed on 2026-04-23 — see the comment in the gb200 -# vllm sibling disagg-gb200-7p1d-dep8-dep16.yaml). The 2026-04-26 dev wheel -# from pypi.nvidia.com is the first wheel post that hash with both -# ai-dynamo and ai-dynamo-runtime aarch64 builds. Without the formatter, -# the dynamo frontend rejects DSv4 at pipeline build time with -# "chat_template field is required in the tokenizer_config.json file" -# and 404s every request — that's what runs #24963242956 and the -# follow-up hit on stable 0.8.x. -# -# The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container does -# NOT bundle ai-dynamo, so install: false is wrong here (gives -# "ModuleNotFoundError: No module named 'dynamo'" the moment srtctl -# tries to launch python3 -m dynamo.sglang). 
The gb200 vllm sibling -# solves the same gap with hash + install: true + a setup_script that -# pulls a prebuilt wheel from /mnt/vast/dynamo_cache; we don't have that -# cache yet for SGLang, so we just let srtctl pip-install the dev wheel -# per rank from pypi.nvidia.com — same payload, slower per-rank install. +# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling +# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace- +# blackwell_arm64 image lacks both a working ai-dynamo and the rust +# toolchain for an in-container build; pinning a published dev wheel +# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat +# shim warns then disagg startup warmup hangs). Same prebuild-cache +# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel +# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/, +# and the setup_script below force-reinstalls from cache per rank +# (~30 s, no per-rank rust build, no API drift). dynamo: - install: true - version: "1.2.0.dev20260425" + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + # install: false → srtctl skips its own pip install; setup_script is + # the sole installer. + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" # Pin both nodes (1P + 1D) to the same rack on cw. Without this they # can land on different racks and pay the cross-rack hop on every NIXL diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh new file mode 100755 index 000000000..e25362cd5 --- /dev/null +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Custom container-deps installer for gb300-cw + sglang. 
pip-installs +# dynamo from a wheel + source archive that launch_gb300-cw.sh pre-built +# on /mnt/vast BEFORE submitting sbatch. +# +# Why the prebuild design (mirrors the vllm sibling at +# gb300-cw-vllm-container-deps.sh from PR #1150): +# srt-slurm's per-rank install path runs `maturin build` inside every +# container srtctl srun's. The lmsysorg/sglang:deepseek-v4-grace- +# blackwell_arm64 image lacks rust pre-installed, so the per-rank +# build path can't run; pinning a published dev wheel (1.2.0.dev*) +# trips API drift against the bundled sglang 0.5.9 (compat shim +# warning + disagg startup warmup hang — see runs ending 2026-04-27). +# Building dynamo ONCE from hash 6a159fed (the same commit the gb200 +# vllm recipe pins, known to be sglang-API-stable) on a single-node +# srun in launch_gb300-cw.sh sidesteps both: every rank pip-installs +# from the cache here (~30 s, no contention). +# +# Used in tandem with `dynamo.install: false` in the gb300-cw sglang +# recipes so srt-slurm's hardcoded install path is skipped and this +# script is the sole installer. + +set -e + +DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" +CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" +DONE_MARKER="$CACHE_DIR/.done" + +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 + echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2 + exit 1 +fi + +echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" +pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall + +rm -rf /tmp/dynamo_build +mkdir -p /tmp/dynamo_build/dynamo +tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo +cd /tmp/dynamo_build/dynamo +pip install --break-system-packages -e . 
+
+echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)"
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index cfa9ac6f1..b03dc6dd9 100755
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -49,6 +49,84 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh
 enroot import -o $SQUASH_FILE docker://$IMAGE
 enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
 
+# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting
+# the main sbatch. The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
+# image lacks a working ai-dynamo (install: false → ModuleNotFoundError),
+# and pinning a published dev wheel (1.2.0.dev*) trips API drift against
+# the bundled sglang 0.5.9 (compat shim warns then disagg startup warmup
+# hangs — see runs ending 2026-04-27). Building from hash 6a159fed (the
+# same commit the gb200 vllm sibling pins, known sglang-API-stable) on
+# a single dedicated srun eliminates per-rank coordination on /mnt/vast
+# (NFS flock is unreliable). Same pattern as PR #1150's vllm launcher.
+DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b"
+DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache"
+DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH"
+DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done"
+mkdir -p "$DYNAMO_CACHE_ROOT"
+
+if [ ! -f "$DYNAMO_DONE_MARKER" ]; then
+    echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..."
+    # Build into a unique temp dir, then atomically mv into place. Two
+    # concurrent runners may both build; the first to finish the rename
+    # wins, the loser cleans up. Same-directory rename() is atomic on
+    # NFS (unlike flock).
+    TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX")
+    # --mem=0: claim full node memory. Default cgroup is much smaller and
+    # rustc's link phase can OOM otherwise. CARGO_BUILD_JOBS=8 caps
+    # parallelism so peak rustc memory stays bounded on a 72-core Grace
+    # node, and `-C debuginfo=0` cuts per-process memory further.
+    srun --partition="$SLURM_PARTITION" --account="$SLURM_ACCOUNT" \
+        --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \
+        --job-name="${RUNNER_NAME}-prebuild" \
+        --container-image="$SQUASH_FILE" \
+        --no-container-entrypoint --no-container-mount-home \
+        --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \
+        bash -c "
+            set -e
+            apt-get update -qq
+            apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1
+            if ! command -v cargo &>/dev/null; then
+                curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+                . \$HOME/.cargo/env
+            fi
+            if ! command -v maturin &>/dev/null; then
+                pip install --break-system-packages maturin
+            fi
+            rm -rf /tmp/dynamo_build
+            mkdir -p /tmp/dynamo_build
+            cd /tmp/dynamo_build
+            git clone https://github.com/ai-dynamo/dynamo.git
+            cd dynamo
+            git checkout $DYNAMO_HASH
+            cd lib/bindings/python/
+            export CARGO_BUILD_JOBS=8
+            export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable'
+            maturin build -o '$TEMP_BUILD'
+            cd /tmp/dynamo_build/dynamo
+            tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \
+                --exclude='lib/bindings/python/target' \
+                --exclude='.git' \
+                .
+            touch '$TEMP_BUILD/.done'
+        "
+    if [ -f "$TEMP_BUILD/.done" ]; then
+        # Atomic publish. `mv -T` never nests into an existing target, so
+        # if another runner already published, mv fails and we discard ours.
+        if mv -T "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then
+            echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR"
+        else
+            echo "[dynamo-prebuild] another runner published first, discarding our copy"
+            rm -rf "$TEMP_BUILD"
+        fi
+    else
+        echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2
+        rm -rf "$TEMP_BUILD"
+        exit 1
+    fi
+else
+    echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR"
+fi
+
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
 export ISL="$ISL"
@@ -81,6 +159,14 @@ git checkout sa-submission-q2-2026
 mkdir -p recipes/sglang/deepseek-v4
 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
 
+# Drop our cache-installer setup_script next to upstream's configs.
+# Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh`
+# alongside `dynamo.install: false` so srtctl skips its own pip install
+# and this script (force-reinstalling from /mnt/vast/dynamo_cache) is the
+# sole installer per rank.
+cp "$GITHUB_WORKSPACE/runners/gb300-cw-sglang-container-deps.sh" configs/gb300-cw-sglang-container-deps.sh
+chmod +x configs/gb300-cw-sglang-container-deps.sh
+
 echo "Installing srtctl..."
 # CRITICAL — uv install location.
# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is From 5e3340c835cd7765355daf5f761976a0783017bb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 20:20:31 -0700 Subject: [PATCH 13/16] =?UTF-8?q?switch=20disagg=20transport=20nixl=20?= =?UTF-8?q?=E2=86=92=20mooncake?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 16 ++++--- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 45 ++++++++++--------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml index 4f514d394..b30f5b4d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml @@ -3,7 +3,7 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4" # DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology: # Prefill: 1 node, TP=4 (no DP-attn, no EP). # Decode: 1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit). -# Both on a single GB300 (4 GPUs / node). KV transfer over NIXL. +# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**. # # Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream # PR #75 exactly; this DEP4 variant is a local extension to probe whether @@ -13,12 +13,14 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4" # Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks # see DP=4 replication. 
SGLang's --disaggregation-decode-tp and # --disaggregation-decode-dp flags on the prefill engine carry this -# metadata so KV chunks route to the correct decode rank during NIXL +# metadata so KV chunks route to the correct decode rank during the # transfer (server_args.py:643-654, validate_disagg_tp_size). # -# Same NIXL state-buffer-transfer caveat as the TP4 sibling - see -# disagg-gb300-1p1d-tp4.yaml header. The grace-blackwell image build -# carries the patch. +# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling. +# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container +# regardless of dynamo version (run 24973148979 with hash 6a159fed + +# prebuild cache still hit the same watchdog timeout). PR #75 calls +# out Mooncake as the working transport for state buffers. # # Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same # way as the symmetric sibling. @@ -103,7 +105,7 @@ backend: trust-remote-code: true tensor-parallel-size: 4 disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so # KV chunks route to the correct decode rank during NIXL transfer. 
disaggregation-decode-tp: 4 @@ -129,7 +131,7 @@ backend: moe-a2a-backend: deepep deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake moe-runner-backend: "flashinfer_mxfp4" disable-flashinfer-autotune: true mem-fraction-static: 0.90 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml index 86b262cfc..928f387f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml @@ -7,27 +7,28 @@ name: "dsv4-sglang-disagg-gb300-1p1d-tp4" # # Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at # 1P1D-TP4 fits trivially within a single rack; the explicit segment -# below pins them so the NIXL KV transfer between prefill and decode -# stays rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto -# segment so each recipe owns its own value.) +# below pins them so the KV transfer between prefill and decode stays +# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment +# so each recipe owns its own value.) # # Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300 -# (4 GPUs / node). KV transfer over NIXL. PR #75 measures saturation -# at conc=128 / 838 Total TPS/GPU; sweep capped accordingly. +# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched +# below — see "Transport: Mooncake"). PR #75 measures saturation at +# conc=128 / 838 Total TPS/GPU; sweep capped accordingly. 
# -# ⚠️ NIXL state-buffer-transfer accuracy bug (upstream PR #75 body): -# the SGLang NIXL backend currently registers and transfers the KV cache -# correctly but DROPS the model's auxiliary state buffer (SWA / NSA / -# Mamba). On DSv4-Pro this collapses GSM8K from 1.000 (agg) to ~0.13 -# (disagg) while throughput numbers and KV byte hashes look healthy. -# Mooncake handles state buffers correctly; the NIXL fix mirrors that -# (~237 lines extending KVArgsRegisterInfo/TransferInfo/register_buffer_ -# to_engine + adding send_state in -# python/sglang/srt/disaggregation/nixl/conn.py). Until the upstream -# sglang fix lands, the patch must be picked up via the -# lmsysorg/sglang:deepseek-v4-grace-blackwell container build. If -# eval-only GSM8K runs come back near 0.13 with healthy throughput, -# that's the cause — not a tuning issue. +# Transport: Mooncake (not NIXL). +# * NIXL hung the prefill startup warmup indefinitely on this stack +# (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the +# DSv4 formatter — compat shim warns on every worker, then a +# 4-token warmup probe never runs forward). See runs through +# 2026-04-27 ~02:35 (gh actions 24973148979) for the exact +# watchdog trace. +# * PR #75 explicitly notes "Mooncake handles state buffers +# correctly" — the disagg accuracy bug it warns about is NIXL- +# specific, and switching to Mooncake side-steps both that bug +# and our warmup hang. +# * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container +# ships the Mooncake transport built-in; no extra deps needed. # # Local deltas vs upstream PR #75: # * benchmark.type = sa-bench (upstream also uses sa-bench in the @@ -66,8 +67,8 @@ extra_mount: - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" # Pin both nodes (1P + 1D) to the same rack on cw. Without this they -# can land on different racks and pay the cross-rack hop on every NIXL -# KV transfer. +# can land on different racks and pay the cross-rack hop on every KV +# transfer. 
sbatch_directives: segment: "2" # Use all node memory; cw default is too tight for the MXFP4 worker. @@ -120,7 +121,7 @@ backend: trust-remote-code: true tensor-parallel-size: 4 disaggregation-mode: "prefill" - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake moe-runner-backend: "flashinfer_mxfp4" disable-flashinfer-autotune: true mem-fraction-static: 0.90 @@ -135,7 +136,7 @@ backend: trust-remote-code: true tensor-parallel-size: 4 disaggregation-mode: "decode" - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake moe-runner-backend: "flashinfer_mxfp4" disable-flashinfer-autotune: true mem-fraction-static: 0.90 From 83867ea50b07735b304389c1a9056ea2011cbbce Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 22:31:05 -0700 Subject: [PATCH 14/16] =?UTF-8?q?strip=20return=5Frouted=5Fexperts=20kwarg?= =?UTF-8?q?=20from=20dynamo=20call=20sites=20=E2=80=94=20sglang=200.5.9=20?= =?UTF-8?q?Engine.async=5Fgenerate=20doesn't=20accept=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runners/gb300-cw-sglang-container-deps.sh | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh index e25362cd5..c8cb84ad0 100755 --- a/runners/gb300-cw-sglang-container-deps.sh +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -42,3 +42,51 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . 
echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 -------------------------- +# ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls +# `engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9 +# bundled in lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 has an +# Engine.async_generate signature that doesn't accept that kwarg, so every +# request 500s with: +# TypeError: Engine.async_generate() got an unexpected keyword argument +# 'return_routed_experts' +# (See run 24973148979 → mooncake unblocked the disagg warmup; this is the +# next failure layer.) Strip the kwarg from every call site in the +# extracted dynamo source. `pip install -e .` above is editable, so the +# patch propagates immediately at next `python3 -m dynamo.sglang ...`. +DYNAMO_SRC=/tmp/dynamo_build/dynamo +patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true) +if [ -n "$patch_targets" ]; then + for f in $patch_targets; do + echo "[dynamo-patch] stripping return_routed_experts kwarg in $f" + # Match `return_routed_experts=,?` where is anything + # up to the next `,` or `)` at the same paren depth. Single-line + # case covers >99% of call sites; the value can be False/True/a + # var name. Trailing comma + whitespace is consumed too so we + # don't leave a stray `, )` behind. + python3 - "$f" <<'PYEOF' +import re, sys +path = sys.argv[1] +with open(path) as fh: + src = fh.read() +# Greedy on whitespace, non-greedy on the value (no commas/parens inside). +new = re.sub( + r'return_routed_experts\s*=\s*[^,)]+\s*,?\s*', + '', + src, +) +if new != src: + with open(path, 'w') as fh: + fh.write(new) +PYEOF + done + echo "[dynamo-patch] verifying no return_routed_experts call sites remain..." + if grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null; then + echo "[dynamo-patch] WARNING: residual matches above (likely defaults / declarations, not call sites). 
Inspect if 500s persist." + else + echo "[dynamo-patch] clean" + fi +else + echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)" +fi From 3efc208cbaba0e8b91eb66fa9ccc0c560120ecac Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 23:20:06 -0700 Subject: [PATCH 15/16] fix dynamo regex: only match whole-line kwarg passes, leave assignment intact --- runners/gb300-cw-sglang-container-deps.sh | 38 ++++++++++++++--------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh index c8cb84ad0..fb6e6b6f8 100755 --- a/runners/gb300-cw-sglang-container-deps.sh +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -58,35 +58,45 @@ echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" DYNAMO_SRC=/tmp/dynamo_build/dynamo patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true) if [ -n "$patch_targets" ]; then + # Match WHOLE LINES that are just a kwarg pass: + # return_routed_experts=,? + # The value is constrained to a simple identifier ([A-Za-z_][\w.]*), + # which deliberately excludes function calls (no `(` allowed). This + # leaves the multi-line assignment statement at decode_handler.py:275 + # intact: + # return_routed_experts = getattr( + # self.config.server_args, "enable_return_routed_experts", False + # ) + # That assignment is dead code after we strip the kwarg passes, but + # leaving it costs nothing and avoids the syntax-error trap from the + # earlier (over-greedy) version of this patch. for f in $patch_targets; do - echo "[dynamo-patch] stripping return_routed_experts kwarg in $f" - # Match `return_routed_experts=,?` where is anything - # up to the next `,` or `)` at the same paren depth. Single-line - # case covers >99% of call sites; the value can be False/True/a - # var name. 
Trailing comma + whitespace is consumed too so we - # don't leave a stray `, )` behind. + echo "[dynamo-patch] stripping return_routed_experts kwarg lines in $f" python3 - "$f" <<'PYEOF' import re, sys path = sys.argv[1] with open(path) as fh: src = fh.read() -# Greedy on whitespace, non-greedy on the value (no commas/parens inside). +# Whole-line kwarg pass: indented `return_routed_experts=,?` then EOL. +# `[A-Za-z_][\w.]*` matches identifiers, attribute access, True/False/None — but NOT calls. new = re.sub( - r'return_routed_experts\s*=\s*[^,)]+\s*,?\s*', + r'^[ \t]+return_routed_experts\s*=\s*[A-Za-z_][\w.]*\s*,?[ \t]*\n', '', src, + flags=re.MULTILINE, ) if new != src: with open(path, 'w') as fh: fh.write(new) + print(f'[dynamo-patch] patched: {path}') +else: + print(f'[dynamo-patch] no kwarg-pass lines matched in: {path}') PYEOF done - echo "[dynamo-patch] verifying no return_routed_experts call sites remain..." - if grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null; then - echo "[dynamo-patch] WARNING: residual matches above (likely defaults / declarations, not call sites). Inspect if 500s persist." - else - echo "[dynamo-patch] clean" - fi + # Sanity: any remaining occurrence is fine if it's the assignment; + # log it so the next person knows what's left. 
+ echo "[dynamo-patch] residual occurrences (expected: only the dead assignment in decode_handler.py):" + grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || echo " (none)" else echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)" fi From 173bd41dee956a2b48c37e043289c021463649d6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 23:09:49 -0700 Subject: [PATCH 16/16] PR85 --- .github/configs/nvidia-master.yaml | 77 ++++++--- .../1k1k/disagg-1p1d-dep4-mega-moe.yaml | 128 ++++++++++++++ .../1k1k/disagg-1p1d-tp4-mxfp4.yaml | 86 ++++++++++ .../disagg-1p2d-dep4-to-dep8-mega-moe.yaml | 127 ++++++++++++++ .../1k1k/disagg-2p2d-dep8-mega-moe.yaml | 126 ++++++++++++++ .../1k1k/disagg-2p2d-tp8-mxfp4.yaml | 98 +++++++++++ .../1k1k/disagg-gb300-1p1d-tp4-dep4.yaml | 153 ----------------- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 159 ------------------ runners/gb300-cw-sglang-container-deps.sh | 28 +++ runners/launch_gb300-cw.sh | 20 ++- 10 files changed, 660 insertions(+), 342 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 
cb9c0c675..3f905d3c8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7728,49 +7728,82 @@ dsv4-fp4-gb300-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - # 1P + 1D on a single GB300 (4 GPUs / node), MXFP4 MoE kernels, NIXL - # KV transfer. Recipes staged at benchmarks/multi_node/srt-slurm- - # recipes/sglang/deepseek-v4/1k1k/ and overlaid into the srt-slurm - # checkout by launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave - # (2x 18-node racks); recipes set their own sbatch_directives.segment - # for rack pinning. - # - # Two search-space bands: - # * Symmetric TP4 (low-conc, 1-128): both sides TP=4. Conc 1/2 give - # single-user latency floor; 4-128 covers the saturation curve - # mirroring NVIDIA/srt-slurm PR #75. - # * Asymmetric TP4 / DEP4 (16-1024): prefill TP=4, decode DP-attn + - # DeepEP. Conc 16-128 overlaps the TP4 band for head-to-head - # comparison (find the crossover where DPA beats TP-only); 256- - # 1024 extends past the symmetric saturation point (~conc=128 / - # 838 Total TPS/GPU per PR #75). + # Five disagg topologies from NVIDIA/srt-slurm PR #85 branch + # recipes/dsv4-agg-disagg, overlaid with cw-specific fields by + # launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave (2x 18-node + # racks); recipes set their own sbatch_directives.segment for rack + # pinning. All use NIXL KV transfer. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + # 1P1D TP=4 MXFP4 — low-latency baseline (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml" + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-tp4-mxfp4.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false - - conc-list: [16, 32, 64, 128, 256, 512, 1024] + # 1P1D DEP4 mega_moe — TEP disagg (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] prefill: num-worker: 1 tp: 4 - ep: 1 - dp-attn: false + ep: 4 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml" + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-dep4-mega-moe.yaml" decode: num-worker: 1 tp: 4 ep: 4 dp-attn: true + # 1P2D asymmetric DEP4->DEP8 mega_moe — best per-GPU efficiency (3 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p2d-dep4-to-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D symmetric DEP8 mega_moe — largest throughput (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D TP=8 MXFP4 — TP-only 4-node baseline (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-tp8-mxfp4.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml new file mode 100644 index 000000000..72baef909 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml @@ -0,0 +1,128 @@ +name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + 
NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml new file mode 100644 index 
000000000..a0b60a00b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml @@ -0,0 +1,86 @@ +name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + use_chat_template: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml new file mode 100644 index 000000000..569373509 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml @@ -0,0 +1,127 @@ +name: "dsv4-pro-gb300-disagg-1p2d-dep4-to-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "3" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + 
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml new file mode 100644 index 000000000..8d82d58cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml @@ -0,0 +1,126 @@ +name: 
"dsv4-pro-gb300-disagg-2p2d-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + 
trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml new file mode 100644 index 000000000..1b697d826 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml @@ -0,0 +1,98 @@ +name: "dsv4-pro-gb300-disagg-2p2d-tp8-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" 
+ +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml deleted file mode 100644 index b30f5b4d1..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4-dep4.yaml +++ /dev/null @@ -1,153 +0,0 @@ -name: "dsv4-sglang-disagg-gb300-1p1d-tp4-dep4" - -# DeepSeek-V4-Pro disaggregated on GB300, asymmetric topology: -# 
Prefill: 1 node, TP=4 (no DP-attn, no EP). -# Decode: 1 node, full DP-attn (TP=4, DP=4) + DeepEP (EP=4 implicit). -# Both on a single GB300 (4 GPUs / node). KV transfer over **Mooncake**. -# -# Sibling of disagg-gb300-1p1d-tp4.yaml. The TP4 sibling mirrors upstream -# PR #75 exactly; this DEP4 variant is a local extension to probe whether -# decode-side DP-attn + DeepEP unlocks throughput past the symmetric -# saturation point (~conc=128 / 838 Total TPS/GPU per PR #75). -# -# Asymmetric KV layout: prefill ranks see TP=4 sharding; decode ranks -# see DP=4 replication. SGLang's --disaggregation-decode-tp and -# --disaggregation-decode-dp flags on the prefill engine carry this -# metadata so KV chunks route to the correct decode rank during the -# transfer (server_args.py:643-654, validate_disagg_tp_size). -# -# Transport: Mooncake (not NIXL) — same rationale as the TP4 sibling. -# NIXL hung the disagg warmup on the lmsysorg sglang 0.5.9 container -# regardless of dynamo version (run 24973148979 with hash 6a159fed + -# prebuild cache still hit the same watchdog timeout). PR #75 calls -# out Mooncake as the working transport for state buffers. -# -# Cluster: gb300-cw (CoreWeave 2x18-node racks); rack-pinned the same -# way as the symmetric sibling. - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" - precision: "fp4" - -# Build dynamo from hash 6a159fed via prebuild cache. See the TP4 -# sibling header for the full rationale and the casualty timeline — -# short version: arm64 container ships no ai-dynamo, dev wheels API- -# drift against sglang 0.5.9 and hang the disagg warmup, so we mirror -# the gb200 vllm sibling's cache pattern (PR #1150) and force-reinstall -# from /mnt/vast/dynamo_cache/ per rank. 
-dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: false - -setup_script: gb300-cw-sglang-container-deps.sh - -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -sbatch_directives: - segment: "2" - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - - decode_environment: - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - # MEGA-MoE / DeepEP envs - only relevant on decode where DP-attn + - # EP is enabled. Mirror gen_launch.py medium/large defaults. - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - tensor-parallel-size: 4 - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - # Decode runs DP-attn TP=4 / DP=4; prefill must be told both so - # KV chunks route to the correct decode rank during NIXL transfer. 
- disaggregation-decode-tp: 4 - disaggregation-decode-dp: 4 - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - mem-fraction-static: 0.90 - max-running-requests: 128 - cuda-graph-max-bs: 128 - chunked-prefill-size: 8192 - disable-radix-cache: true - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - tensor-parallel-size: 4 - # Full DP-attn on 4 GPUs: each rank is its own DP unit for - # attention; MoE is sharded across EP (ep_size = tp_size = 4 - # implicit when --moe-a2a-backend deepep). - enable-dp-attention: true - dp-size: 4 - moe-a2a-backend: deepep - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - mem-fraction-static: 0.90 - max-running-requests: 512 - cuda-graph-max-bs: 512 - chunked-prefill-size: 8192 - disable-radix-cache: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - # Conc 16-128 overlaps the TP4 sibling for head-to-head comparison - # (where does decode-side DPA start beating TP-only?); 256-1024 - # probes throughput past the symmetric saturation point. - concurrencies: "16x32x64x128x256x512x1024" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml deleted file mode 100644 index 928f387f3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb300-1p1d-tp4.yaml +++ /dev/null @@ -1,159 +0,0 @@ -name: "dsv4-sglang-disagg-gb300-1p1d-tp4" - -# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — sglang + -# dynamo frontend. 
Ported from NVIDIA/srt-slurm PR #75 -# (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml). GB300 sibling of -# the dsv4-sglang-disagg-gb200-1p1d-dep8-tep8 recipe in this directory tree. -# -# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). 2 nodes total at -# 1P1D-TP4 fits trivially within a single rack; the explicit segment -# below pins them so the KV transfer between prefill and decode stays -# rack-local. (Cluster's srtslurm.yaml turns off srtctl's auto segment -# so each recipe owns its own value.) -# -# Topology: 1 prefill node + 1 decode node, each TP=4 on a single GB300 -# (4 GPUs / node). KV transfer over **Mooncake** (was NIXL; switched -# below — see "Transport: Mooncake"). PR #75 measures saturation at -# conc=128 / 838 Total TPS/GPU; sweep capped accordingly. -# -# Transport: Mooncake (not NIXL). -# * NIXL hung the prefill startup warmup indefinitely on this stack -# (sglang 0.5.9 in container vs ai-dynamo ≥1.1.0 needed for the -# DSv4 formatter — compat shim warns on every worker, then a -# 4-token warmup probe never runs forward). See runs through -# 2026-04-27 ~02:35 (gh actions 24973148979) for the exact -# watchdog trace. -# * PR #75 explicitly notes "Mooncake handles state buffers -# correctly" — the disagg accuracy bug it warns about is NIXL- -# specific, and switching to Mooncake side-steps both that bug -# and our warmup hang. -# * The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 container -# ships the Mooncake transport built-in; no extra deps needed. -# -# Local deltas vs upstream PR #75: -# * benchmark.type = sa-bench (upstream also uses sa-bench in the -# latest revision; matches). -# * sbatch_directives.segment + mem: rack-pinning for cw, mirroring -# the dynamo-vllm gb300 recipe convention. Upstream targets a -# different cluster and doesn't need this. 
- -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64" - precision: "fp4" - -# Build dynamo from hash 6a159fed (the same commit the gb200 vllm sibling -# pins, known sglang-API-stable). The lmsysorg/sglang:deepseek-v4-grace- -# blackwell_arm64 image lacks both a working ai-dynamo and the rust -# toolchain for an in-container build; pinning a published dev wheel -# (1.2.0.dev*) trips API drift against bundled sglang 0.5.9 (compat -# shim warns then disagg startup warmup hangs). Same prebuild-cache -# pattern as PR #1150 for vllm: launch_gb300-cw.sh builds the wheel -# ONCE on a single-node srun, drops it at /mnt/vast/dynamo_cache/, -# and the setup_script below force-reinstalls from cache per rank -# (~30 s, no per-rank rust build, no API drift). -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # install: false → srtctl skips its own pip install; setup_script is - # the sole installer. - install: false - -setup_script: gb300-cw-sglang-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin both nodes (1P + 1D) to the same rack on cw. Without this they -# can land on different racks and pay the cross-rack hop on every KV -# transfer. -sbatch_directives: - segment: "2" - # Use all node memory; cw default is too tight for the MXFP4 worker. 
- mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - - decode_environment: - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - tensor-parallel-size: 4 - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - mem-fraction-static: 0.90 - max-running-requests: 128 - cuda-graph-max-bs: 128 - chunked-prefill-size: 8192 - disable-radix-cache: true - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - tensor-parallel-size: 4 - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - mem-fraction-static: 0.90 - max-running-requests: 128 - cuda-graph-max-bs: 128 - chunked-prefill-size: 8192 - disable-radix-cache: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - # Low-latency band — TP4 1P1D saturates near conc=128 on GB300 - # (PR #75 verified: 838 Total TPS/GPU at conc=128). 
Conc 1/2 give - # single-user latency floor reference; 4-128 covers the saturation - # curve. For high-conc Pareto use the DEP variants. - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" - use_chat_template: false diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh index fb6e6b6f8..348c436ef 100755 --- a/runners/gb300-cw-sglang-container-deps.sh +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -43,6 +43,34 @@ pip install --break-system-packages -e . echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" +# --- NIXL DSv4 state-buffer patch: sglang PR #23773 -------------------------- +# The disagg recipes use NIXL KV transfer. Without this patch, NIXL +# silently drops auxiliary state buffers (SWA / NSA / Mamba), causing +# decode-side accuracy to collapse on DSv4-Pro. The patch mirrors what +# the Mooncake backend already does. See NVIDIA/srt-slurm PR #85 README. +SGLANG_DIR="${SGLANG_DIR:-/sgl-workspace/sglang}" +SGLANG_REMOTE="https://github.com/sgl-project/sglang.git" +SGLANG_PR_NUMBER="23773" +SGLANG_PR_REF="refs/pull/${SGLANG_PR_NUMBER}/head" +SGLANG_LOCAL_BRANCH="nixl-dsv4-pr-${SGLANG_PR_NUMBER}" + +echo "=== Installing SGLang NIXL DSV4 fix from PR #${SGLANG_PR_NUMBER} ===" + +if [ -d "$SGLANG_DIR/.git" ]; then + cd "$SGLANG_DIR" + git config --global --add safe.directory "$SGLANG_DIR" 2>/dev/null || true + if git remote get-url origin >/dev/null 2>&1; then + git remote set-url origin "$SGLANG_REMOTE" + else + git remote add origin "$SGLANG_REMOTE" + fi + git fetch --depth 1 origin "$SGLANG_PR_REF" + git checkout -f -B "$SGLANG_LOCAL_BRANCH" FETCH_HEAD + echo "Checked out SGLang PR #${SGLANG_PR_NUMBER} at $(git rev-parse HEAD)" +else + echo "WARNING: $SGLANG_DIR/.git not found; skipping NIXL patch (container may already include fix)" +fi + # --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 -------------------------- # ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls # 
`engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index b03dc6dd9..edbb55375 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -14,7 +14,7 @@ if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/. export MODEL_PATH="/mnt/vast/models/dsv4/" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + export SRT_SLURM_MODEL_PREFIX="dsv4-pro" else echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" exit 1 @@ -150,14 +150,16 @@ fi git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 +git checkout recipes/dsv4-agg-disagg -# Overlay our hand-rolled DSv4 SGLang recipes. NVIDIA/srt-slurm has no -# upstream sglang DSv4 disagg recipe yet beyond PR #75's 1P1D-TP4 -# entry, so we ship the recipe locally and copy it in here. `cp -rT` -# overlays onto a possibly-existing upstream stub instead of nesting. -mkdir -p recipes/sglang/deepseek-v4 -cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 +# Overlay our cw-adapted DSv4 SGLang disagg recipes onto the upstream +# recipes from PR #85. The upstream recipes at +# recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/ don't carry +# cw-specific fields (dynamo.install, setup_script, extra_mount, +# sbatch_directives), so we overlay locally-maintained copies that add +# those. `cp -rT` replaces the upstream files in place. 
+mkdir -p recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k" recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp # Drop our cache-installer setup_script next to upstream's configs. # Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh` @@ -223,7 +225,9 @@ model_paths: containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} + dsv4-grace-blackwell: ${SQUASH_FILE} "${IMAGE}": ${SQUASH_FILE} + nginx: ${NGINX_SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} # Auto-emission of #SBATCH --segment={total_nodes} is turned off here # because each gb300 recipe sets its own segment via sbatch_directives