Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5477ee4
[AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X
chunfangamd Mar 11, 2026
326b31d
[AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support
chunfangamd Mar 11, 2026
7d9eb51
[AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API
chunfangamd Mar 11, 2026
02a547c
[AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster)
chunfangamd Mar 12, 2026
ab656e1
[AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0
chunfangamd Mar 12, 2026
6bb39b4
[AMD] Use public vLLM base image with runtime dependency install
chunfangamd Mar 12, 2026
b4dad14
[AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg d…
chunfangamd Mar 13, 2026
d2b9332
[AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware p…
chunfangamd Mar 13, 2026
159b571
[AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh
ichbinblau Mar 17, 2026
d7e9e50
Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but…
ichbinblau Mar 17, 2026
fe12e82
[AMD] Fix vLLM disagg hang: READ mode support + safety timeouts
chunfangamd Mar 19, 2026
3e05159
Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs)
chunfangamd Mar 21, 2026
4d1a315
[AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client ha…
chunfangamd Mar 22, 2026
0006d60
[AMD] Fix vLLM disagg Slurm job never terminating after benchmark com…
chunfangamd Mar 22, 2026
a002143
[AMD] Enable MoRI-IO READ mode by default for vLLM disagg
chunfangamd Mar 22, 2026
eba7f66
[AMD] Fix CI checkout failure caused by root-owned __pycache__ files
chunfangamd Mar 22, 2026
8c01e38
[AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS
chunfangamd Mar 23, 2026
7f03362
[AMD] Fix KV reaper deadlock on high-ISL disagg workloads
chunfangamd Mar 23, 2026
5fedd82
[AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,…
ichbinblau Mar 24, 2026
708570b
[AMD] Upgrade vLLM disagg image from v0.17.1 to v0.18.0
chunfangamd Mar 29, 2026
96154d2
[AMD] Add Kimi-K2.5-MXFP4 disagg inference config (1P2D)
chunfangamd Mar 30, 2026
25d1f59
feat: add MiniMax M2.5 PD disaggregation recipe (1P2D, MoRI-EP + MoRI…
chunfangamd Apr 3, 2026
ac24450
feat: add Dockerfile and runtime patch for MiniMax M2.5 WideEP + MoRI
chunfangamd Apr 3, 2026
e553a8c
Fix: rename minimaxm25 to minimaxm2.5 for CI naming consistency
chunfangamd Apr 3, 2026
ebaabd2
Optimize: add --gpu-memory-utilization 0.95 and --block-size 32 to Mi…
chunfangamd Apr 3, 2026
4d82c0f
Fix: MiniMax M2.5 disagg — require EP=8 for prefill, fix ROCm gate dtype
chunfangamd Apr 3, 2026
e163312
Remove unused docker/minimax-m25-disagg/ directory
chunfangamd Apr 3, 2026
185df53
remove vllm disagg for dpsr1 and dpv3
ichbinblau Apr 13, 2026
48cc23a
consolidate amd_utils for sglang and vllm
ichbinblau Apr 21, 2026
5adfe2b
use vLLM router as default router for vllm disagg
ichbinblau Apr 21, 2026
0734709
fix bugs
ichbinblau Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 107 additions & 1 deletion .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,6 @@ dsr1-fp8-mi355x-sglang-disagg:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp8-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -993,6 +992,113 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"

# Kimi-K2.5 MXFP4 on MI355X, vLLM prefill/decode disaggregation.
# NOTE(review): leading indentation was lost in this diff extraction; nesting
# below is reconstructed from the key semantics and the sibling
# *-sglang-disagg entries in this file — confirm against the original file.
kimik2.5-fp4-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:v0.18.0
  model: amd/Kimi-K2.5-MXFP4
  model-prefix: kimik2.5
  runner: mi355x-disagg
  precision: fp4
  framework: vllm-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

    - isl: 8192
      osl: 1024
      search-space:
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

# MiniMax M2.5 FP8 on MI355X, vLLM prefill/decode disaggregation.
# NOTE(review): leading indentation was lost in this diff extraction; nesting
# below is reconstructed from the key semantics and the sibling entries in
# this file — confirm against the original file.
minimaxm2.5-fp8-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:v0.18.0
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: mi355x-disagg
  precision: fp8
  framework: vllm-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
        # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
        # TP8 shards to 192 which is not divisible by FP8 block_n=128.
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

    - isl: 8192
      osl: 1024
      search-space:
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
Expand Down
72 changes: 54 additions & 18 deletions benchmarks/multi_node/amd_utils/bench.sh
Original file line number Diff line number Diff line change
@@ -1,63 +1,99 @@
#!/bin/bash
# Dual-Engine Disaggregated Benchmark Runner
#
# ENGINE=sglang-disagg (default): SGLang benchmark
# ENGINE=vllm / vllm-disagg:      vLLM benchmark
#
# Produces JSON result files via benchmark_serving.py so that the CI pipeline
# can collect and process results.
#
# Required env (optional): MODEL_PATH (overrides <model_dir>/<model_name>),
#   ROUTER_PORT (default 30000), IS_MTP ("true" enables chat template on sglang),
#   ENGINE (see above).
#
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
#                      <model_dir> <model_name> <log_path> <isl> <osl> \
#                      <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
#
# NOTE(review): this block was reconstructed from an interleaved old/new diff
# rendering; the post-change (new) side is kept throughout.

ENGINE="${ENGINE:-sglang-disagg}"

n_prefill=$1
n_decode=$2
prefill_gpus=$3
decode_gpus=$4
model_path=$5
model_name=$6
# Allow callers to override the fully-resolved model path via env.
MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
log_path=$7

chosen_isl=${8:-1024}
chosen_osl=${9:-1024}
concurrency_list=${10:-"512x1"}
# Default request rate differs by engine: vLLM benchmarks default to
# unthrottled ("inf"), SGLang to 1 req/s.
# NOTE(review): this tests ENGINE == "vllm" while the flag/cooldown logic
# below tests "vllm-disagg" — confirm both ENGINE spellings are intended.
if [[ "$ENGINE" == "vllm" ]]; then
  chosen_req_rate=${11:-inf}
else
  chosen_req_rate=${11:-1}
fi
random_range_ratio=${12:-0.8}
num_prompts_multiplier=${13:-10}

# Concurrency list arrives "a x b x c"-style, e.g. "64x128x256".
IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"

ROUTER_PORT="${ROUTER_PORT:-30000}"

echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"

profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
mkdir -p "$profile_folder"

source "$(dirname "$0")/../../benchmark_lib.sh"

# Repo root inside the container (3 levels up from this script's directory)
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"

for max_concurrency in "${chosen_concurrencies[@]}"; do

  export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"

  # Scale prompt count with concurrency, with a floor so tiny concurrency
  # values still produce a statistically useful sample.
  num_prompts=$(( max_concurrency * num_prompts_multiplier ))
  if [[ "$num_prompts" -lt 16 ]]; then
    num_prompts=16
  fi

  echo "profile_folder: $profile_folder"
  echo "max_concurrency: $max_concurrency"
  echo "chosen_req_rate: $chosen_req_rate"
  echo "MODEL_PATH: $MODEL_PATH"
  echo "ROUTER_PORT: $ROUTER_PORT"
  echo "chosen_isl: $chosen_isl"
  echo "chosen_osl: $chosen_osl"
  echo "num_prompts: $num_prompts"
  echo "export_file: $export_file"

  # Engine-specific extra flags
  extra_flags=""
  if [[ "$ENGINE" == "vllm-disagg" ]]; then
    extra_flags="--trust-remote-code"
  else
    if [ "$IS_MTP" = "true" ]; then
      extra_flags="--use-chat-template"
    fi
  fi

  run_benchmark_serving \
    --bench-serving-dir "$REPO_ROOT" \
    --model "$MODEL_PATH" \
    --port "$ROUTER_PORT" \
    --backend openai \
    --input-len "$chosen_isl" \
    --output-len "$chosen_osl" \
    --random-range-ratio "$random_range_ratio" \
    --num-prompts "$num_prompts" \
    --max-concurrency "$max_concurrency" \
    --result-filename "$export_file" \
    --result-dir /workspace/ \
    $extra_flags

  echo "-----------------------------------------"

  # vLLM: cooldown between rounds for idle KV block reaper
  if [[ "$ENGINE" == "vllm-disagg" ]]; then
    echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
    sleep 10
  fi
done
Loading