diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ec9cbc11e..f2d33e091 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7428,3 +7428,53 @@ kimik2.5-fp4-gb200-dynamo-vllm:
             tp: 16
             ep: 16
             dp-attn: true
+
+# DeepSeek-V4-Pro on GB200, SGLang aggregated (TP=8 across 2 nodes).
+# Recipes live in YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm
+# PR #69), derived from the official SGLang DeepSeek-V4 cookbook.
+# `framework: sglang` (no Dynamo frontend) tells the runner to clone that
+# fork instead of NVIDIA/srt-slurm and to use the recipe directly.
+dsv4-fp4-gb200-sglang:
+  image: lmsysorg/sglang:deepseek-v4-grace-blackwell
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: sglang
+  multinode: true
+  disagg: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # Low-latency: TP=8 + EAGLE 3/4 speculative decoding (smaller batches,
+        # better TPOT). Recipe targets the low-conc end of the curve.
+        - conc-list: [1, 2, 4, 8, 16, 32, 64]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+          additional-settings:
+            # https://github.com/NVIDIA/srt-slurm/pull/69/files#diff-recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
+            - "CONFIG_FILE=recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml"
+          decode:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+        # Throughput: TP=8 with no MTP (matches cookbook's "throughput" tier).
+        - conc-list: [128, 256, 512, 1024]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+          additional-settings:
+            # https://github.com/NVIDIA/srt-slurm/pull/69/files#diff-recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
+            - "CONFIG_FILE=recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml"
+          decode:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ddc6409c2..14b0067e2 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,13 @@
+- config-keys:
+  - dsv4-fp4-gb200-sglang
+  description:
+    - "Add DeepSeek-V4-Pro SGLang aggregated GB200 benchmarks (1k/1k, TP=8, 2 nodes)"
+    - "Recipes from YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm PR #69)"
+    - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell"
+    - "Two recipes: agg-2n-low-latency (EAGLE 3/4 spec decoding) for conc 1-64, agg-2n-nomtp for conc 128-1024"
+    - "Runner script clones the YAMY1234 fork pinned at commit da535e87 instead of NVIDIA/srt-slurm"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+
 - config-keys:
   - dsr1-fp8-h100-dynamo-trt
   - dsr1-fp8-h100-dynamo-sglang
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index b746e4a24..11d5a424b 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -46,6 +46,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
         echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4"
         exit 1
     fi
+elif [[ $FRAMEWORK == "sglang" ]]; then
+    # Direct SGLang aggregated serving (no Dynamo frontend), used by recipes
+    # in YAMY1234/srt-slurm-nv:dsv4-pro-recipes (NVIDIA srt-slurm PR #69).
+    if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+        export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro"
+        export SRT_SLURM_MODEL_PREFIX="dsv4-pro"
+    else
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for sglang: dsv4/fp4"
+        exit 1
+    fi
 else
     export MODEL_PATH=$MODEL
 fi
@@ -134,7 +144,22 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
-if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+if [[ $FRAMEWORK == "sglang" && $MODEL_PREFIX == "dsv4" ]]; then
+    # YAMY1234's fork of NVIDIA/srt-slurm, branch dsv4-pro-recipes
+    # (https://github.com/NVIDIA/srt-slurm/pull/69) — adds DeepSeek-V4-Pro
+    # SGLang aggregated recipes for GB200 / GB300 derived from the SGLang
+    # DeepSeek-V4 cookbook. Pinned to the PR head commit for reproducibility.
+    git clone https://github.com/YAMY1234/srt-slurm-nv.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout da535e87338cfac0388fc301f9c87b7bc5e669a6
+    # The upstream recipes hardcode slurm.partition to NVIDIA's internal
+    # partition names (gb200 / gb300). Rewrite to our partition so sbatch
+    # doesn't fail with "invalid partition specified".
+    find recipes/gb200-fp4 recipes/gb300-fp4 -type f -name "*.yaml" -exec \
+        sed -i "s/^  partition: gb200$/  partition: ${SLURM_PARTITION}/" {} +
+    find recipes/gb200-fp4 recipes/gb300-fp4 -type f -name "*.yaml" -exec \
+        sed -i "s/^  partition: gb300$/  partition: ${SLURM_PARTITION}/" {} +
+elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q2-2026
@@ -187,6 +212,7 @@ model_paths:
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}
+  dsv4-grace-blackwell: ${SQUASH_FILE}
   "${IMAGE}": ${SQUASH_FILE}
   nginx-sqsh: ${NGINX_SQUASH_FILE}
 EOF