Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/configs/srt-slurm-validation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Minimal config for validating NVIDIA/srt-slurm#41 end-to-end on GB200.
# Referenced explicitly via --config-files on a one-off workflow_dispatch;
# NOT picked up by normal sweeps (they run against nvidia-master.yaml).
# Uses dsr1-fp4-dynamo-trt, which the launcher maps to
# /mnt/lustre01/models/deepseek-r1-0528-fp4-v2/ (present on both the GH
# runner and compute nodes, so preflight passes). Smallest 8k1k trtllm
# recipe at 1P+8D (no zip_override, nginx-sqsh alias matches launcher).
#
# NOTE(review): block nesting below (seq-len-configs -> search-space ->
# prefill/decode) was reconstructed from key order; confirm against the
# config schema used by nvidia-master.yaml.
dsr1-fp4-gb200-dynamo-trt:
  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
  model-prefix: dsr1
  runner: gb200
  precision: fp4
  framework: dynamo-trt
  multinode: true
  disagg: true
  seq-len-configs:
    # Single 8k-in / 1k-out point; one concurrency value keeps the run short.
    - isl: 8192
      osl: 1024
      search-space:
        - conc-list: [24]
          prefill:
            num-worker: 1
            tp: 4
            ep: 4
            dp-attn: true
            additional-settings:
              # https://github.com/NVIDIA/srt-slurm/blob/main/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
              - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
          decode:
            num-worker: 4
            tp: 8
            ep: 8
            dp-attn: false
15 changes: 15 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ on:
description: "Git ref (branch/sha) to checkout"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""

env:
RANDOM_RANGE_RATIO: 0.8
Expand Down Expand Up @@ -126,6 +136,11 @@ env:
DECODE_EP: ${{ inputs.decode-ep }}
DECODE_DP_ATTN: ${{ inputs.decode-dp-attn }}

# Optional override for which srt-slurm repo/ref the launcher clones.
# Leave empty to use the launcher's built-in defaults per framework.
SRT_SLURM_REPO: ${{ inputs.srt-slurm-repo }}
SRT_SLURM_REF: ${{ inputs.srt-slurm-ref }}

permissions:
contents: read

Expand Down
24 changes: 24 additions & 0 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ on:
description: "Ref (branch/sha) to checkout for generating configs"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""
workflow_call:
inputs:
generate-cli-command:
Expand All @@ -30,6 +40,16 @@ on:
description: "Ref (branch/sha) to checkout for generating configs"
required: false
type: string
srt-slurm-repo:
description: "Override srt-slurm clone URL (leave empty to use launcher default)"
required: false
type: string
default: ""
srt-slurm-ref:
description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
required: false
type: string
default: ""

jobs:
get-jobs:
Expand Down Expand Up @@ -102,6 +122,8 @@ jobs:
decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
run-eval: false
ref: ${{ inputs.ref }}
srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
srt-slurm-ref: ${{ inputs.srt-slurm-ref }}

test-sweep-multi-node-evals:
needs: get-jobs
Expand Down Expand Up @@ -143,6 +165,8 @@ jobs:
eval-only: true
eval-conc: ${{ matrix.config.eval-conc }}
ref: ${{ inputs.ref }}
srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
srt-slurm-ref: ${{ inputs.srt-slurm-ref }}

test-sweep-single-node:
needs: get-jobs
Expand Down
30 changes: 20 additions & 10 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,20 +134,27 @@ if [ -d "$SRT_REPO_DIR" ]; then
rm -rf "$SRT_REPO_DIR"
fi

# Allow SRT_SLURM_REPO / SRT_SLURM_REF to override the default clone source
# (useful for testing WIP branches like the generalized lm-eval-main).
# The branches below only pick the per-framework defaults; the actual
# clone/checkout happens exactly once, after the override is applied.
if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q2-2026"
elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q2-2026"
else
    DEFAULT_SRT_REPO="https://github.com/ishandhanani/srt-slurm.git"
    DEFAULT_SRT_REF="sa-submission-q1-2026"
fi

# ":-" falls back when the variable is unset OR empty, so an empty-string
# override (e.g. a workflow input left blank) still selects the default.
SRT_SLURM_REPO="${SRT_SLURM_REPO:-$DEFAULT_SRT_REPO}"
SRT_SLURM_REF="${SRT_SLURM_REF:-$DEFAULT_SRT_REF}"

echo "Cloning ${SRT_SLURM_REPO} @ ${SRT_SLURM_REF}"
# Fail fast: with a user-supplied repo/ref, a typo must not silently fall
# through to running against whatever happens to be checked out.
git clone "$SRT_SLURM_REPO" "$SRT_REPO_DIR" || {
    echo "ERROR: failed to clone ${SRT_SLURM_REPO}" >&2
    exit 1
}
cd "$SRT_REPO_DIR" || exit 1
git checkout "$SRT_SLURM_REF" || {
    echo "ERROR: failed to check out ${SRT_SLURM_REF} in ${SRT_SLURM_REPO}" >&2
    exit 1
}

echo "Installing srtctl..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
Expand Down Expand Up @@ -197,7 +204,10 @@ cat srtslurm.yaml
echo "Running make setup..."
make setup ARCH=aarch64

# Export eval-related env vars for srt-slurm post-benchmark eval.
# LM_EVAL_WORKSPACE is what the generalized srt-slurm reads; INFMAX_WORKSPACE
# is kept for compatibility with older srt-slurm branches (sa-submission-*).
# Both are set to the same value, $GITHUB_WORKSPACE.
export LM_EVAL_WORKSPACE="$GITHUB_WORKSPACE"
export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"

echo "Submitting job with srtctl..."
Expand Down
Loading