50 changes: 50 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -7428,3 +7428,53 @@ kimik2.5-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

# DeepSeek-V4-Pro on GB200, SGLang aggregated (TP=8 across 2 nodes).
# Recipes live in YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm
# PR #69), derived from the official SGLang DeepSeek-V4 cookbook.
# `framework: sglang` (no Dynamo frontend) tells the runner to clone that
# fork instead of NVIDIA/srt-slurm and to use the recipe directly.
dsv4-fp4-gb200-sglang:
  image: lmsysorg/sglang:deepseek-v4-grace-blackwell
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb200
  precision: fp4
  framework: sglang
  multinode: true
  disagg: false
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # Low-latency: TP=8 + EAGLE 3/4 speculative decoding (smaller batches,
        # better TPOT). Recipe targets the low-conc end of the curve.
        - conc-list: [1, 2, 4, 8, 16, 32, 64]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/pull/69/files#diff-recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
            - "CONFIG_FILE=recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml"
Comment on lines +7447 to +7460

🔴 The low-latency search-space entry (conc 1-64) references agg-2n-low-latency.yaml, which this PR's own description and the adjacent YAML comment describe as using EAGLE 3/4 speculative decoding, but the entry omits spec-decoding: "mtp" and therefore defaults to "none". Because SPEC_DECODING is plumbed through the runner into the result metadata (utils/process_result.py line 53), every EAGLE run from this config will be mislabeled as non-speculative in downstream dashboards, and it will be indistinguishable from the agg-2n-nomtp entry in any aggregation that keys on spec-decoding. Fix: add spec-decoding: "mtp" to the conc 1-64 entry at lines 7452-7465, matching the convention used by every other EAGLE/MTP config in this file.

Extended reasoning...

What the bug is

The new dsv4-fp4-gb200-sglang config in .github/configs/nvidia-master.yaml has two search-space entries for the 1k/1k seq-len. The first entry (conc [1, 2, 4, 8, 16, 32, 64]) points at recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml. This PR's own description says agg-2n-low-latency.yaml — TP=8 + EAGLE 3/4 speculative decoding, the adjacent YAML comment at line 7450 says Low-latency: TP=8 + EAGLE 3/4 speculative decoding, and the perf-changelog entry says agg-2n-low-latency (EAGLE 3/4 spec decoding). Despite this, the search-space entry does not set spec-decoding: "mtp".

Why the default is wrong here

Per utils/matrix_logic/validation.py:227-228, MultiNodeSearchSpaceEntry.spec_decoding is declared as Literal["mtp", "draft_model", "none"] with default="none". So the low-latency EAGLE entry silently becomes spec_decoding="none" when the matrix is generated. Every other EAGLE/MTP recipe in this same file explicitly sets spec-decoding: "mtp" (the *-trt-mtp, *-sglang-mtp, *-dynamo-sglang-mtp entries; the changelog's own description of the other MTP PRs in this file confirms this is the established convention).
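The silent-default behavior can be sketched in a few lines (a stdlib dataclass stands in for the repo's actual Pydantic model; the class and field names here only mirror validation.py and are illustrative):

```python
from dataclasses import dataclass
from typing import List, Literal

# Hypothetical stand-in for MultiNodeSearchSpaceEntry (validation.py:227-228),
# using a stdlib dataclass instead of the repo's Pydantic model.
@dataclass
class SearchSpaceEntry:
    conc_list: List[int]
    spec_decoding: Literal["mtp", "draft_model", "none"] = "none"

# The PR's low-latency entry: spec-decoding omitted, default silently applied.
low_latency = SearchSpaceEntry(conc_list=[1, 2, 4, 8, 16, 32, 64])
print(low_latency.spec_decoding)  # prints: none (despite the recipe using EAGLE)

# The convention every other EAGLE/MTP config in the file follows.
fixed = SearchSpaceEntry(conc_list=[1, 2, 4, 8, 16, 32, 64], spec_decoding="mtp")
print(fixed.spec_decoding)  # prints: mtp
```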

Impact

utils/process_result.py is the result-metadata writer run for every benchmark point. At module load (line 27) it requires SPEC_DECODING as an env var, and at line 53 it writes 'spec_decoding': spec_decoding into the result JSON that is uploaded for dashboards:

data = {
    ...
    'framework': framework,
    'precision': precision,
    'spec_decoding': spec_decoding,
    ...
}

With spec-decoding missing from the YAML, the generated matrix entry carries spec_decoding="none", the job's SPEC_DECODING env var is set to "none", and all seven low-latency EAGLE points (conc 1, 2, 4, 8, 16, 32, 64) get written into result metadata as non-speculative. Downstream dashboards and aggregations that key on spec_decoding will silently fold the EAGLE points in with the non-MTP entry (agg-2n-nomtp.yaml, conc 128-1024), producing a combined sweep that looks like a pure non-MTP config — you lose the ability to see the EAGLE speedup at low concurrency.

The multi-node eval-grouping logic in utils/matrix_logic/generate_sweep_configs.py:106-114 also keys on spec-decoding, but the grouping targets only 8k1k (see target_isl, target_osl = seq_len_stoi["8k1k"] at line 51) and this config is 1k1k, so eval selection is not directly affected. The metadata-correctness impact on result labeling remains, which is the primary defect.
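The 8k1k-only gating described above reduces to a toy check like this (the tuple-valued `seq_len_stoi` is an assumption about a structure the diff does not show):

```python
# Toy version of the eval-grouping gate in generate_sweep_configs.py:
# grouping targets only the 8k1k bucket, so this PR's 1k1k config is skipped.
seq_len_stoi = {"1k1k": (1024, 1024), "8k1k": (8192, 1024)}  # assumed shape
target_isl, target_osl = seq_len_stoi["8k1k"]

def eligible_for_eval_grouping(isl: int, osl: int) -> bool:
    return (isl, osl) == (target_isl, target_osl)

print(eligible_for_eval_grouping(1024, 1024))  # prints: False (this config)
print(eligible_for_eval_grouping(8192, 1024))  # prints: True
```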

Proof (step-by-step)

  1. generate_sweep_configs.py expands the config. For the conc=16 point in the first search-space entry, it calls into the validated MultiNodeSearchSpaceEntry model. Because spec-decoding is absent, Pydantic applies the default "none" (validation.py:227-228).
  2. The entry is emitted into the matrix with spec-decoding: "none".
  3. The matrix row drives a GitHub Actions job; the runner sets SPEC_DECODING=none in the environment alongside FRAMEWORK=sglang, MODEL_PREFIX=dsv4, CONFIG_FILE=recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml.
  4. srtctl apply runs the agg-2n-low-latency.yaml recipe, which actually does use EAGLE 3/4 speculative decoding (per the YAMY1234/srt-slurm-nv fork linked in the diff).
  5. Post-benchmark, utils/process_result.py reads SPEC_DECODING="none" and writes {"spec_decoding": "none", ...} into the per-run result JSON that is uploaded for dashboards.
  6. The EAGLE run is now indistinguishable from a non-speculative run in downstream aggregations.
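Steps 1-6 can be traced end to end in a short sketch (env var names are from the source; the dictionaries are illustrative, not the repo's code):

```python
import json
import os

# Steps 1-2: spec-decoding is absent from the YAML, so the generated
# matrix entry carries the Pydantic default.
matrix_entry = {"conc": 16, "spec-decoding": "none"}

# Step 3: the runner exports it into the job environment.
os.environ["SPEC_DECODING"] = matrix_entry["spec-decoding"]

# Step 5: a process_result.py-style metadata write reads it back.
result = {
    "framework": "sglang",
    "spec_decoding": os.environ["SPEC_DECODING"],
}
print(json.dumps(result))  # the EAGLE run is recorded as non-speculative
```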

Fix

Add spec-decoding: "mtp" to the first search-space entry:

    - conc-list: [1, 2, 4, 8, 16, 32, 64]
      spec-decoding: "mtp"
      prefill:
        num-worker: 1
        ...

The second (throughput) entry correctly leaves it at the default since agg-2n-nomtp.yaml has no MTP.

          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
        # Throughput: TP=8 with no MTP (matches cookbook's "throughput" tier).
        - conc-list: [128, 256, 512, 1024]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
          additional-settings:
            # https://github.com/NVIDIA/srt-slurm/pull/69/files#diff-recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
            - "CONFIG_FILE=recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
10 changes: 10 additions & 0 deletions perf-changelog.yaml
@@ -1,3 +1,13 @@
- config-keys:
    - dsv4-fp4-gb200-sglang
  description:
    - "Add DeepSeek-V4-Pro SGLang aggregated GB200 benchmarks (1k/1k, TP=8, 2 nodes)"
    - "Recipes from YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm PR #69)"
    - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell"
    - "Two recipes: agg-2n-low-latency (EAGLE 3/4 spec decoding) for conc 1-64, agg-2n-nomtp for conc 128-1024"
    - "Runner script clones the YAMY1234 fork pinned at commit da535e87 instead of NVIDIA/srt-slurm"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
Comment on lines +1 to +9

🔴 The new dsv4-fp4-gb200-sglang changelog entry was prepended to the top of perf-changelog.yaml (lines 1-9), but AGENTS.md line 160 explicitly requires new entries to be appended to the END of the file ("oldest at the top, newest at the bottom"). All other recent entries (#1120, #1040, #1043, etc.) correctly live at the bottom, so prepending here inverts chronological ordering for this one entry. Please move the 9-line block from the top of the file to the end.

Extended reasoning...

What the bug is

AGENTS.md line 160 states the rule plainly:

The file is read in chronological order: oldest at the top, newest at the bottom. New entries MUST be appended to the END of the file — never insert in the middle or prepend.

The diff for perf-changelog.yaml in this PR is a single hunk @@ -1,3 +1,13 @@ that inserts the new dsv4-fp4-gb200-sglang entry at the very top of the file, before the pre-existing dsr1-fp8-h100-dynamo-trt/dsr1-fp8-h100-dynamo-sglang entry. That directly violates the documented rule.

Why existing code/convention doesn't prevent it

The rule is a human-enforced convention documented in AGENTS.md — there is no lint or CI check that validates ordering, so the only guard is reviewer/author attention. Every other recent PR in the repo (e.g. #1120 evals trigger, #1040 qwen atom, #1043 glm5.1 atom, #1098, #1106, #1094) lands its entry at the bottom, which is how new readers correctly infer the chronology.
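Had an automated guard existed, it could be as simple as checking that the new file starts with the old one verbatim (hypothetical helper, not part of the repo):

```python
def is_append_only(old_text: str, new_text: str) -> bool:
    """True iff new_text only adds content at the end of old_text."""
    return new_text.startswith(old_text)

# Illustrative changelog fragments.
old = "- config-keys:\n    - dsr1-fp8-h100-dynamo-trt\n"
new_entry = "- config-keys:\n    - dsv4-fp4-gb200-sglang\n"

print(is_append_only(old, old + new_entry))  # prints: True  (appended, OK)
print(is_append_only(old, new_entry + old))  # prints: False (prepended, rejected)
```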

Impact

Anyone scanning perf-changelog.yaml top-down to understand the evolution of configs will now see the DSV4-Pro GB200 SGLang entry (a brand-new April 2026 submission) as if it were the oldest change, predating things like 70b-fp8-*-vllm (PR #95) and gptoss-fp4-*-trt (PR #110). That misleads humans, and any future tooling that parses the file chronologically (e.g. "what changed since last quarter") would get the wrong answer.

How to fix

Move the 9-line block currently at lines 1-9 of perf-changelog.yaml to the end of the file (after the last glm5.1-fp4-mi355x-atom entry from PR #1043). The content of the entry itself is fine — only its position needs to change.

Step-by-step proof

  1. Open AGENTS.md line 160 → confirms the rule: "appended to the END of the file — never insert in the middle or prepend."
  2. Open the PR diff for perf-changelog.yaml → the single hunk is @@ -1,3 +1,13 @@, meaning the 10 added lines (9 entry + 1 blank separator) sit at the very beginning of the file. The first pre-existing entry - config-keys: [dsr1-fp8-h100-dynamo-trt, ...] is pushed from line 1 down to line 11.
  3. Compare to PR trigger H100 multinode evals #1120 (the most recent "evals trigger" entry in the file) and PR [AMD/ROCM] atom glm5.1 fp4 on mi355x #1043 (glm5.1-fp4-mi355x-atom) — both sit at the bottom of the modified file, per the rule.
  4. Therefore this PR prepends while all sibling PRs appended, violating the explicit documented convention.


- config-keys:
    - dsr1-fp8-h100-dynamo-trt
    - dsr1-fp8-h100-dynamo-sglang
28 changes: 27 additions & 1 deletion runners/launch_gb200-nv.sh
@@ -46,6 +46,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
    echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4"
    exit 1
  fi
elif [[ $FRAMEWORK == "sglang" ]]; then
  # Direct SGLang aggregated serving (no Dynamo frontend), used by recipes
  # in YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm PR #69).
  if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
    export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro"
    export SRT_SLURM_MODEL_PREFIX="dsv4-pro"
  else
    echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for sglang: dsv4/fp4"
    exit 1
  fi
else
  export MODEL_PATH=$MODEL
fi
@@ -134,7 +144,22 @@ if [ -d "$SRT_REPO_DIR" ]; then
rm -rf "$SRT_REPO_DIR"
fi

if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
if [[ $FRAMEWORK == "sglang" && $MODEL_PREFIX == "dsv4" ]]; then
  # YAMY1234's fork of NVIDIA/srt-slurm, branch dsv4-pro-recipes
  # (https://github.com/NVIDIA/srt-slurm/pull/69) — adds DeepSeek-V4-Pro
  # SGLang aggregated recipes for GB200 / GB300 derived from the SGLang
  # DeepSeek-V4 cookbook. Pinned to the PR head commit for reproducibility.
  git clone https://github.com/YAMY1234/srt-slurm-nv.git "$SRT_REPO_DIR"
  cd "$SRT_REPO_DIR"
  git checkout da535e87338cfac0388fc301f9c87b7bc5e669a6
  # The upstream recipes hardcode slurm.partition to NVIDIA's internal
  # partition names (gb200 / gb300). Rewrite to our partition so sbatch
  # doesn't fail with "invalid partition specified".
  find recipes/gb200-fp4 recipes/gb300-fp4 -type f -name "*.yaml" -exec \
    sed -i "s/^  partition: gb200$/  partition: ${SLURM_PARTITION}/" {} +
  find recipes/gb200-fp4 recipes/gb300-fp4 -type f -name "*.yaml" -exec \
    sed -i "s/^  partition: gb300$/  partition: ${SLURM_PARTITION}/" {} +
elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
  git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
  cd "$SRT_REPO_DIR"
  git checkout sa-submission-q2-2026
@@ -187,6 +212,7 @@ model_paths:
containers:
  dynamo-trtllm: ${SQUASH_FILE}
  dynamo-sglang: ${SQUASH_FILE}
  dsv4-grace-blackwell: ${SQUASH_FILE}
  "${IMAGE}": ${SQUASH_FILE}
  nginx-sqsh: ${NGINX_SQUASH_FILE}
EOF
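The partition rewrite in the runner diff can be exercised against a throwaway recipe file (the /tmp path and the batch_gb200 partition name are made up for illustration; two-space YAML indentation in the sed pattern is assumed, since the diff view collapsed whitespace):

```shell
# Create a sample recipe that hardcodes NVIDIA's internal partition name.
mkdir -p /tmp/recipes/gb200-fp4
cat > /tmp/recipes/gb200-fp4/demo.yaml <<'YAML'
slurm:
  partition: gb200
YAML

# Rewrite it to the local cluster's partition, as the runner does.
SLURM_PARTITION=batch_gb200
find /tmp/recipes/gb200-fp4 -type f -name "*.yaml" -exec \
  sed -i "s/^  partition: gb200$/  partition: ${SLURM_PARTITION}/" {} +

cat /tmp/recipes/gb200-fp4/demo.yaml
```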