From 5477ee4f946d39d5d4c3ef6a9d1a9d18946a7a00 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 11:19:28 +0000 Subject: [PATCH 01/31] [AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X Add multi-node vLLM PD disaggregation recipe using Nixl/RIXL KV transfer and vllm-router, mirroring the existing SGLang disagg recipe structure. - New benchmark config: dsr1-fp8-mi355x-vllm-disagg (1P2D, TP8) - New utils: vllm_disagg_utils/ (job.slurm, server.sh, submit.sh, etc.) - Runner: extend launch_mi355x-amds.sh for vllm-disagg framework --- .github/configs/amd-master.yaml | 72 +++ .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 47 ++ .../multi_node/vllm_disagg_utils/bench.sh | 70 +++ .../multi_node/vllm_disagg_utils/env.sh | 52 ++ .../multi_node/vllm_disagg_utils/job.slurm | 326 +++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 444 ++++++++++++++++++ .../vllm_disagg_utils/start_etcd.sh | 47 ++ .../multi_node/vllm_disagg_utils/submit.sh | 131 ++++++ .../multi_node/vllm_disagg_utils/sync.py | 198 ++++++++ runners/launch_mi355x-amds.sh | 15 +- 10 files changed, 1399 insertions(+), 3 deletions(-) create mode 100755 benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/bench.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/env.sh create mode 100644 benchmarks/multi_node/vllm_disagg_utils/job.slurm create mode 100755 benchmarks/multi_node/vllm_disagg_utils/server.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/submit.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/sync.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index de7f5e62a..06f4d6b37 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -994,6 +994,78 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" +dsr1-fp8-mi355x-vllm-disagg: + image: vllm_disagg_pd:latest + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 1024 + osl: 8192 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..a457a2714 --- /dev/null +++ 
b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# vLLM disagg uses TP-only parallelism (no EP/DP). +# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh new file mode 100755 index 000000000..cfe66d460 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# vLLM Disaggregated Benchmark Runner +# +# Usage: bash bench.sh \ +# \ +# + +n_prefill=$1 +n_decode=$2 +prefill_gpus=$3 +decode_gpus=$4 +model_path=$5 +model_name=$6 +# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +log_path=$7 + +chosen_isl=${8:-1024} +chosen_osl=${9:-1024} +concurrency_list=${10:-"512x1"} +chosen_req_rate=${11:-inf} +random_range_ratio=${12:-0.8} +num_prompts_multiplier=${13:-10} + +IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" + +ROUTER_PORT="${ROUTER_PORT:-2584}" + +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" + +profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" + +for max_concurrency in "${chosen_concurrencies[@]}"; do + + export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + + echo "profile_folder: $profile_folder" + echo "max_concurrency: $max_concurrency" + echo "chosen_req_rate: $chosen_req_rate" + echo "MODEL_PATH: $MODEL_PATH" + echo "ROUTER_PORT: $ROUTER_PORT" + echo "chosen_isl: $chosen_isl" + echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" + echo "export_file: $export_file" + + vllm bench serve \ + --model "$MODEL_PATH" \ + --backend vllm \ + --host 127.0.0.1 \ + --port "$ROUTER_PORT" \ + --dataset-name "random" \ + --random-input-len "$chosen_isl" \ + --random-output-len "$chosen_osl" \ + --random-prefix-len 0 \ + --num-prompts "$num_prompts" \ + --request-rate "$chosen_req_rate" \ + --ignore-eos \ + --max-concurrency "$max_concurrency" \ + 2>&1 | tee "${export_file}.log" + + sleep 5 + echo "-----------------------------------------" +done diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh new file mode 100755 index 000000000..ebe77f09b --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# vLLM/Nixl environment setup for 
multi-node disaggregated serving. +# +# REQUIRED ENVIRONMENT VARIABLES: +# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) +# Set by runner or auto-detected from hostname. +# +# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). + +set -x + +# IBDEVICES configuration +# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) +# Fall back to hostname detection if not set (for direct script execution) +if [[ -z "$IBDEVICES" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 + elif [[ $NODENAME == mia1* ]]; then + export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + else + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" + else + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 + fi + fi + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" +else + echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" +else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" +fi + +export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} + +# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing +export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} + +set +x +echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm new file mode 100644 index 000000000..710b7168a --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -0,0 +1,326 @@ +#!/bin/bash +#SBATCH --job-name=vllm-pd-bench +#SBATCH -N 4 # CHECK this to be right in batch jobs +#SBATCH -n 4 # CHECK this to be right in batch jobs +#SBATCH --ntasks-per-node=1 +#SBATCH --spread-job +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 +# --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR + +echo "=== Job Start Time ===" +echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" +echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "=======================" +echo "" + +# ============================================================================= +# Model Validation +# ============================================================================= + +VALID_MODELS=( + "Llama-3.1-405B-Instruct-FP8-KV" + "amd-Llama-3.3-70B-Instruct-FP8-KV" + "DeepSeek-V3" + "DeepSeek-R1-0528" + "gpt-oss-120b" +) + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" +model_found=false +for m in "${VALID_MODELS[@]}"; do + [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break +done +if [[ "$model_found" != "true" ]]; then + echo "Error: Model '$MODEL_NAME' not found. 
Available:" + printf ' - %s\n' "${VALID_MODELS[@]}" + exit 1 +fi +echo "Model found: $MODEL_NAME" + +RUN_FILE="server.sh" +echo "Runfile set: $RUN_FILE" + +# DI_REPO_DIR points to the repo root. +# $(pwd) is vllm_disagg_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. +export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) + +xP="${xP:-1}" +yD="${yD:-1}" + +# Benchmark configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" + +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# ============================================================================= +# Model Path Resolution +# ============================================================================= + +# HF cache directory names may differ from MODEL_NAME +declare -A MODEL_DIR_NAMES=( + ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" +) + +# MODEL_DIR detection: prefer env var, fall back to hostname detection +if [[ -z "$MODEL_DIR" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + MODEL_DIR="/nfsdata" + elif [[ $NODENAME == mia1* ]]; then + MODEL_DIR="/it-share/data" + else + MODEL_DIR="/nfsdata" + fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" +fi +export MODEL_DIR + +DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + +resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 +} + +MODEL_PATH="" +SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" +) + +for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi +done + +if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 +fi +echo "Final MODEL_PATH: $MODEL_PATH" + +# ============================================================================= +# Node Selection and vLLM-Specific NUM_NODES +# ============================================================================= + +# vLLM needs xP + yD + 1 (dedicated proxy node) +NUM_NODES=$((xP + yD + 1)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" + +FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) +SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') + +# Update SLURM environment variables +export SLURM_NNODES=$NUM_NODES +export SLURM_NTASKS=$NUM_NODES +export SLURM_JOB_NUM_NODES=$NUM_NODES +export SLURM_NPROCS=$NUM_NODES +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" +export SLURM_NTASKS_PER_NODE=1 + +echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= + +USER_NAME=$(whoami) +MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) +NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') +NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') + +IPS=() +for NODE in $SELECTED_NODES; do + IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') + IP=$(echo "$IP" | awk '/src/ {print $7}') + IPS+=("$IP") +done + +echo "Node IPs: ${IPS[*]}" + +DOCKER_MOUNT_PATH="/workspace" +VLLM_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/vllm_disagg_utils" + +NNODES=$NUM_NODES + +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" + +# Reduce log spam +export TQDM_MININTERVAL=20 + +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + +export DI_REPO_DIR=$DI_REPO_DIR +export VLLM_WS_PATH=$VLLM_WS_PATH +export NNODES=$NNODES +export NODE0_ADDR=$NODE0_ADDR +export MODEL_PATH=$MODEL_PATH +export MODEL_DIR=$MODEL_DIR +export xP=$xP +export yD=$yD +export MODEL_NAME=$MODEL_NAME +export USER_NAME=$USER_NAME +export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" +export GPUS_PER_NODE=$GPUS_PER_NODE +export BENCH_INPUT_LEN=$BENCH_INPUT_LEN +export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN +export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO +export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER +export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE +export DRY_RUN="${DRY_RUN:-0}" +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" + +SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') +export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" + +SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) + +cleanup() { + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + echo "[${SLURM_JOB_ID}] cleanup done." 
+} + +trap cleanup INT TERM HUP + +# Force NFS cache refresh on all nodes +echo "Refreshing NFS caches on all nodes..." +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + sync + ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils > /dev/null 2>&1 + stat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + cat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true + echo "NFS cache refreshed on $(hostname)" +' + +srun \ + --nodelist="$SELECTED_NODELIST_SRUN" \ + --kill-on-bad-exit=1 \ + --signal=TERM@30 \ + --unbuffered \ + bash -lc " +set -euo pipefail + +echo \"Rank \$SLURM_PROCID on \$(hostname)\" + +# Pre-clean (idempotent) +sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true +sudo docker ps -aq | xargs -r sudo docker stop || true + +exec sudo docker run --rm \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ + -v /lib/x86_64-linux-gnu/libionic.so.1:/lib/x86_64-linux-gnu/libionic.so.1:ro \ + -v /lib/x86_64-linux-gnu/libionic.so:/lib/x86_64-linux-gnu/libionic.so:ro \ + -v /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:ro \ + -v ${MODEL_DIR}:/models \ + -v \$HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=\$SLURM_JOB_ID \ + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ + -e NNODES=\$NNODES \ + -e NODE_RANK=\$SLURM_PROCID \ + -e NODE0_ADDR=\$NODE0_ADDR \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e VLLM_WS_PATH=${VLLM_WS_PATH} \ + -e GPUS_PER_NODE=\$GPUS_PER_NODE \ + -e xP=\$xP \ + -e yD=\$yD \ + -e IPADDRS=\$IPADDRS \ + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE \ + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ + -e DRY_RUN=\$DRY_RUN \ + -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e UCX_TLS=all \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=info \ + -e HSA_ENABLE_SDMA=1 \ + --name \"$DOCKER_CONT_NAME\" \ + \"$DOCKER_IMAGE_NAME\" bash -lc ' + mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' + '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log + ' + +DOCKER_EXIT_CODE=\$? 
+if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then + echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" + exit \$DOCKER_EXIT_CODE +fi +" + +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh new file mode 100755 index 000000000..b4ab7bce8 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -0,0 +1,444 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router node +# 1..xP -> Prefill nodes (kv_producer) +# xP+1..xP+yD -> Decode nodes (kv_consumer) + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +ROUTER_PORT="${ROUTER_PORT:-2584}" +SERVER_PORT="${SERVER_PORT:-2584}" +ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" + +# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $VLLM_WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) +rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1) +rdma_ip="${rdma_ip:-$host_ip}" +host_name=$(hostname) + +echo "[INFO] Management IP (barriers/proxy): $host_ip" +echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" + +# --------------------------------------------------------------------------- +# RDMA route setup for Pensando ionic (RoCEv2) point-to-point /31 links. +# Each benic interface has a /31 to the TOR switch. Without explicit routes, +# traffic to other nodes' RDMA IPs falls through to the management network +# (no RDMA capability). Fix: add a /24 route via the TOR gateway so RoCEv2 +# stays on the ionic fabric. 
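+# Example (hypothetical addresses): rdma_ip=192.168.12.16 gives subnet 12 and
+# host 16, so rdma_gw=192.168.12.$((16 | 1))=192.168.12.17 (the /31 peer), and
+# the route installed below is "192.168.12.0/24 via 192.168.12.17 dev <rdma_iface>".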
+# --------------------------------------------------------------------------- +if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + rdma_subnet="${BASH_REMATCH[1]}" + rdma_host="${BASH_REMATCH[2]}" + rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" # /31 peer = TOR switch + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi +fi + +# Patch Nixl UCX backend: set ucx_error_handling_mode=none for shared-memory +# transport compatibility (Pensando ionic NICs don't support rdmacm, so the +# default UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors) +NIXL_API_FILE=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) +if [[ -n "$NIXL_API_FILE" ]]; then + if ! grep -q 'ucx_error_handling_mode' "$NIXL_API_FILE"; then + sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$NIXL_API_FILE" + echo "[PATCH] Added ucx_error_handling_mode=none to $NIXL_API_FILE" + else + echo "[PATCH] ucx_error_handling_mode already set in $NIXL_API_FILE" + fi +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 + exit 1 +fi + +# ============================================================================= +# Model-Specific Configuration Maps +# ============================================================================= + +declare -A MODEL_PREFILL_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_DECODE_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_ENVS=( + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + 
["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" +) + +get_model_config() { + local mode="$1" + local model_name="$2" + if [[ "$mode" == "prefill" ]]; then + echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + elif [[ "$mode" == "decode" ]]; then + echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + fi +} + +get_model_envs() { + echo "${MODEL_ENVS[$1]:-""}" +} + +if [[ -z "$MODEL_NAME" ]]; then + echo "ERROR: MODEL_NAME is not set"; exit 1 +fi + +PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") +DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") +PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +echo "Using model-specific configuration for: $MODEL_NAME" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +etcd_pid=$! 
+ +echo "Waiting at etcd server barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do + PREFILL_ARGS+="${IP_ARRAY[$i]} " +done + +for ((i=xP+1; i<${#IP_ARRAY[@]}; i++)); do + DECODE_ARGS+="${IP_ARRAY[$i]} " +done + +echo "Prefill node IPs: ${PREFILL_ARGS}" +echo "Decode node IPs: ${DECODE_ARGS}" + +# Common UCX/Nixl environment for prefill and decode workers +setup_ucx_env() { + export UCX_TLS=all + export UCX_SOCKADDR_TLS_PRIORITY=tcp + export UCX_MEMTYPE_CACHE=y + export UCX_RNDV_SCHEME=get_zcopy + export UCX_RNDV_THRESH=4k + export UCX_ROCM_IPC_MIN_ZCOPY=0 + export HSA_ENABLE_SDMA=1 + export UCX_LOG_LEVEL=info + export VLLM_USE_V1=1 + export VLLM_SERVER_DEV_MODE=0 + export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} + export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 +} + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO =======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model : ${MODEL_NAME:-'Not specified'}" + echo "================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node" + echo "Prefill servers: ${PREFILL_ARGS}" + echo "Decode servers: ${DECODE_ARGS}" + echo "================================================" + + PD_IPADDRS="${IPADDRS#*,}" + echo "Waiting for all prefill and decode servers to be up . . ." + python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${PD_IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + + echo "Congratulations!!! All prefill and decode servers are up . . ." + + echo "Starting vLLM Router..." 
+ [ -f /root/.cargo/env ] && source /root/.cargo/env + + PREFILL_URLS="" + DECODE_URLS="" + for ip in ${PREFILL_ARGS}; do + PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " + done + for ip in ${DECODE_ARGS}; do + DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " + done + + ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ + vllm-router \ + --host 0.0.0.0 \ + --port $ROUTER_PORT \ + --vllm-pd-disaggregation \ + $PREFILL_URLS \ + $DECODE_URLS \ + --policy round_robin \ + --prefill-policy round_robin \ + --decode-policy round_robin \ + --intra-node-data-parallel-size 1 \ + --retry-max-retries 3 \ + --health-check-endpoint /health \ + --prometheus-port 29000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log" + set -x + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + set +x + proxy_pid=$! + + HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $VLLM_WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $VLLM_WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server" + [[ "$DRY_RUN" -eq 0 ]] && kill $proxy_pid + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_ucx_env + for env_pair in ${PREFILL_MODEL_ENVS}; do + export "$env_pair" + done + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." 
+ BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_ucx_env + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + done + + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid +fi + +echo "Killing the etcd server" +kill $etcd_pid + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh b/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh new file mode 100755 index 000000000..46bbd2964 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +IPADDRS="${IPADDRS:-localhost}" + +# Use management network IP (matching what the Slurm script resolved) +host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I | awk '{print $1}') +fi + +IFS=',' read -ra ADDR <<< "$IPADDRS" + +# Determine node name based on position in the IPADDRS list +index=0 +for ip in "${ADDR[@]}"; do + if [[ "$ip" == "$host_ip" ]]; then + break + fi + index=$((index + 1)) +done +node_name="etcd-$((index+1))" + +# Build initial cluster string +initial_cluster="" +for i in "${!ADDR[@]}"; do + peer_name="etcd-$((i+1))" + initial_cluster+="$peer_name=http://${ADDR[i]}:2380" + if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then + initial_cluster+="," + fi +done + +mkdir -p /var/lib/etcd +rm -rf /var/lib/etcd/* + +/usr/local/bin/etcd/etcd \ + --name "$node_name" \ + --data-dir /var/lib/etcd \ + --initial-advertise-peer-urls http://$host_ip:2380 \ + --listen-peer-urls http://0.0.0.0:2380 \ + --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls http://$host_ip:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster "$initial_cluster" \ + --initial-cluster-state new \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh new file mode 100755 index 000000000..a41a31d79 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# +# Cluster Configuration Template for Multi-Node vLLM Disaggregated Serving +# +# This script submits a multi-node vLLM disaggregated benchmark job to SLURM. +# It must be configured for your specific cluster before use. +# +# Key difference from SGLang: vLLM uses a dedicated proxy node, so +# NUM_NODES = PREFILL_NODES + DECODE_NODES + 1. 
+ +usage() { + cat << 'USAGE' +Usage: + bash submit.sh \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + NODE_LIST Optional: comma-separated hostnames + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) +USAGE +} + +check_env() { + local name="$1" + if [[ -z "${!name:-}" ]]; then + echo "Error: ${name} not specified" >&2 + usage >&2 + exit 1 + fi +} + +check_env SLURM_ACCOUNT +check_env SLURM_PARTITION +check_env TIME_LIMIT + +check_env MODEL_PATH +check_env MODEL_NAME +check_env CONTAINER_IMAGE +check_env RUNNER_NAME + +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# COMMAND_LINE ARGS +PREFILL_NODES=$1 +PREFILL_WORKERS=${2:-1} +DECODE_NODES=$3 +DECODE_WORKERS=${4:-1} +ISL=$5 +OSL=$6 +CONCURRENCIES=$7 +REQUEST_RATE=$8 +NODE_LIST=${9} + +# vLLM needs xP + yD + 1 nodes (dedicated proxy node) +NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" + +# Export variables for the SLURM job +export MODEL_DIR=$MODEL_PATH +export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE +export PROFILER_ARGS=$profiler_args + +# For vLLM, each worker = 1 node (TP=8 per node). +# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +export xP=$PREFILL_NODES +export yD=$DECODE_NODES +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME +export BENCH_INPUT_LEN=${ISL} +export BENCH_OUTPUT_LEN=${OSL} +export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} +export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} +export BENCH_REQUEST_RATE=${REQUEST_RATE} + +# Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +mkdir -p "$BENCHMARK_LOGS_DIR" + +# Optional: pass an explicit node list to sbatch. +NODELIST_OPT=() +if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then + IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" + if [[ "${#NODE_ARR[@]}" -ne "$NUM_NODES" ]]; then + echo "Error: NODE_LIST has ${#NODE_ARR[@]} nodes but NUM_NODES=${NUM_NODES}" >&2 + echo "Error: NODE_LIST='${NODE_LIST}'" >&2 + exit 1 + fi + NODELIST_CSV="$(IFS=,; echo "${NODE_ARR[*]}")" + NODELIST_OPT=(--nodelist "$NODELIST_CSV") +fi + +# Construct the sbatch command +sbatch_cmd=( + sbatch + --parsable + -N "$NUM_NODES" + -n "$NUM_NODES" + "${NODELIST_OPT[@]}" + --time "$TIME_LIMIT" + --partition "$SLURM_PARTITION" + --account "$SLURM_ACCOUNT" + --job-name "$RUNNER_NAME" + --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" + --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" + "$(dirname "$0")/job.slurm" +) + +JOB_ID=$("${sbatch_cmd[@]}") +if [[ $? 
-ne 0 ]]; then + echo "Error: Failed to submit job with sbatch" >&2 + exit 1 +fi +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py new file mode 100755 index 000000000..140951519 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/sync.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Multi-node synchronization utilities for disaggregated inference. + +Subcommands: + barrier - Wait until all specified nodes have opened their ports (TCP barrier) + Optionally wait for HTTP health endpoints to return 200 + wait - Block until a remote port closes (shutdown coordination) +""" + +import socket +import time +import threading +import argparse +import sys +import urllib.request +import urllib.error + + +def is_port_open(ip, port, timeout=2): + """Check if a given IP and port are accessible.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(timeout) + return s.connect_ex((ip, port)) == 0 + + +def check_health(ip, port, path="/health", timeout=2): + """Return True if http://ip:port/path returns HTTP 200.""" + try: + url = f"http://{ip}:{port}{path}" + req = urllib.request.Request(url) + with urllib.request.urlopen(req, timeout=timeout) as resp: + return getattr(resp, "status", 200) == 200 + except (urllib.error.URLError, urllib.error.HTTPError, OSError): + return False + + +# ============================================================================= +# barrier subcommand +# ============================================================================= + +def cmd_barrier(args): + """Wait until all nodes have opened the specified ports.""" + NODE_IPS = [ip.strip() for ip in args.node_ips.split(",") if ip.strip()] + NODE_PORTS = [int(p.strip()) for p in args.node_ports.split(",") if p.strip()] + + if not NODE_IPS: + print("Error: NODE_IPS argument is empty or not set.") + sys.exit(1) + + if len(NODE_PORTS) == 1: + NODE_PORTS *= len(NODE_IPS) + elif len(NODE_PORTS) != len(NODE_IPS): + print("Error: Number of ports must match number of node IPs or only one port should be given for all.") + sys.exit(1) + + server_socket = None + + def open_port(): + nonlocal server_socket + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server_socket.bind((args.local_ip, args.local_port)) + server_socket.listen(5) + print(f"Port {args.local_port} is now open on {args.local_ip}.") + while True: + conn, addr = server_socket.accept() + conn.close() + + def close_port(): + nonlocal server_socket + if server_socket: + server_socket.close() + print(f"Port {args.local_port} has been closed on {args.local_ip}.") + + if args.enable_port: + threading.Thread(target=open_port, daemon=True).start() + + # Wait for all ports (TCP check) + if args.wait_for_all_ports: + start_time = time.time() + timeout = args.timeout + + while True: + if timeout > 0: + elapsed = time.time() - start_time + if elapsed >= timeout: + not_open = [(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS) + if not is_port_open(ip, port)] + print(f"ERROR: Timeout after {timeout} seconds waiting for ports to open.", flush=True) + print("The following nodes/ports are still not responding:", flush=True) + for ip, port in not_open: + print(f" - {ip}:{port}", flush=True) + sys.exit(1) + + all_open = all(is_port_open(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS)) + if all_open: + break + + if timeout > 0: + remaining = timeout - (time.time() - start_time) + 
print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . . ({remaining:.0f}s remaining)", flush=True) + else: + print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . .", flush=True) + time.sleep(5) + + # Wait for all health endpoints (HTTP check) + if args.wait_for_all_health: + health_path = args.health_endpoint + start_time = time.time() + timeout = args.timeout + + while True: + if timeout > 0: + elapsed = time.time() - start_time + if elapsed >= timeout: + not_ready = [ + (ip, port) + for ip, port in zip(NODE_IPS, NODE_PORTS) + if not check_health(ip, port, health_path) + ] + print(f"ERROR: Timeout after {timeout} seconds waiting for health endpoints.", flush=True) + print(f"The following (http://ip:port{health_path}) are still not responding:", flush=True) + for ip, port in not_ready: + print(f" - http://{ip}:{port}{health_path}", flush=True) + sys.exit(1) + + all_ready = all( + check_health(ip, port, health_path) + for ip, port in zip(NODE_IPS, NODE_PORTS) + ) + if all_ready: + break + + if timeout > 0: + remaining = timeout - (time.time() - start_time) + print( + f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) .. ({remaining:.0f}s remaining)", + flush=True, + ) + else: + print(f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) ..", flush=True) + time.sleep(30) + + if args.enable_port: + time.sleep(30) + close_port() + + +# ============================================================================= +# wait subcommand +# ============================================================================= + +def cmd_wait(args): + """Wait while a remote port remains open, exit when it closes.""" + print(f"Waiting while port {args.remote_port} on {args.remote_ip} is open...") + while is_port_open(args.remote_ip, args.remote_port): + time.sleep(5) + print(f"Port {args.remote_port} on {args.remote_ip} is now closed.") + + +# ============================================================================= +# CLI +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser(description="Multi-node synchronization utilities.") + subparsers = parser.add_subparsers(dest="command", required=True) + + # barrier subcommand + bp = subparsers.add_parser("barrier", help="Wait for all nodes to open specified ports.") + bp.add_argument("--local-ip", required=False, help="Local IP address to bind the server.") + bp.add_argument("--local-port", type=int, required=False, help="Port number to bind the server.") + bp.add_argument("--enable-port", action="store_true", help="Enable opening and closing of local port.") + bp.add_argument("--node-ips", required=True, help="Comma-separated list of node IPs.") + bp.add_argument("--node-ports", required=True, help="Comma-separated list of ports to check.") + bp.add_argument("--timeout", type=int, default=600, + help="Timeout in seconds (default: 600). 
Set to 0 for no timeout.") + bp.add_argument("--wait-for-all-ports", action="store_true", + help="Wait until all node ports are open (TCP).") + bp.add_argument("--wait-for-all-health", action="store_true", + help="Wait until http://ip:port/health returns 200 for all nodes.") + bp.add_argument("--health-endpoint", default="/health", + help="Path for health check (default: /health).") + bp.set_defaults(func=cmd_barrier) + + # wait subcommand + wp = subparsers.add_parser("wait", help="Wait while a remote port remains open.") + wp.add_argument("--remote-ip", required=True, help="Remote server IP address.") + wp.add_argument("--remote-port", type=int, required=True, help="Remote port number.") + wp.set_defaults(func=cmd_wait) + + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5e3225b81..edbeb0614 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" @@ -108,8 +108,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +prefixes = ["sglang", "vllm"] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + for pfx in prefixes: + subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY From 326b31d3331d18cfc4fd9aaa8e506a412167826e Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 17:50:16 +0000 Subject: [PATCH 02/31] [AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support Extract hardcoded model configurations from server.sh bash maps and job.slurm VALID_MODELS into a declarative models.yaml, mirroring the SGLang disagg recipe pattern. Adding a new model now requires no script changes. 
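For illustration, the per-model fields can be pulled from models.yaml with the
same awk pattern job.slurm now uses for hf_dir. A minimal sketch of such a
lookup (the yaml_field helper name is illustrative, not part of this patch):

    yaml_field() {  # yaml_field <model> <field> [models.yaml]
      local model="$1" field="$2" yaml="${3:-models.yaml}"
      awk -v m="$model" -v f="$field" '
        $0 == m ":"             {in_model = 1; next}  # enter the model block
        in_model && /^[^ ]/     {exit}                # next top-level key ends it
        in_model && $1 == f ":" {
          line = $0
          sub(/^[^:]*: */, "", line)                  # drop the "  field: " prefix
          gsub(/^"|"$/, "", line)                     # strip surrounding quotes
          print line; exit
        }' "$yaml"
    }
    # e.g. yaml_field DeepSeek-R1-0528 prefill_flags prints that model's prefill vLLM flags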
Also: - Consolidate UCX transport vars in job.slurm Docker env; remove duplicated setup_ucx_env() from server.sh - Extract RDMA workarounds (ionic /31 route fix, Nixl UCX patch) into setup_rdma_env() helper - Lower UCX_LOG_LEVEL from info to warn - Add nicctl mount and QoS/DSCP auto-detection to env.sh - Remove stale host libionic bind-mounts (driver now built into image) --- .../multi_node/vllm_disagg_utils/env.sh | 54 +++++- .../multi_node/vllm_disagg_utils/job.slurm | 46 +++-- .../multi_node/vllm_disagg_utils/models.yaml | 41 +++++ .../multi_node/vllm_disagg_utils/server.sh | 162 ++++++++---------- 4 files changed, 184 insertions(+), 119 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/models.yaml diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index ebe77f09b..f4340e812 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -33,9 +33,17 @@ else fi if [[ -z "$UCX_NET_DEVICES" ]]; then - FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) - if [[ -n "$FIRST_IB" ]]; then - export UCX_NET_DEVICES="${FIRST_IB}:1" + # Use the first benic interface for UCX TCP transport (maps to ionic RDMA NIC). + # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1) + # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider). + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi fi echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" else @@ -48,5 +56,43 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} +# QoS/DSCP configuration for lossless RoCEv2 fabric. +# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname +if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" +elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi +else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." + fi +fi + set +x -echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX" +echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 710b7168a..494ef6901 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -18,13 +18,14 @@ echo "" # Model Validation # ============================================================================= -VALID_MODELS=( - "Llama-3.1-405B-Instruct-FP8-KV" - "amd-Llama-3.3-70B-Instruct-FP8-KV" - "DeepSeek-V3" - "DeepSeek-R1-0528" - "gpt-oss-120b" -) +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (vllm_disagg_utils/). +MODELS_YAML="$(pwd)/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "Error: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then echo "Error: DOCKER_IMAGE_NAME is not set." @@ -32,13 +33,10 @@ if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then fi MODEL_NAME="${MODEL_NAME:-None}" -model_found=false -for m in "${VALID_MODELS[@]}"; do - [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break -done -if [[ "$model_found" != "true" ]]; then - echo "Error: Model '$MODEL_NAME' not found. Available:" - printf ' - %s\n' "${VALID_MODELS[@]}" +if ! 
grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then + echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Available models:" + grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" @@ -67,11 +65,6 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # Model Path Resolution # ============================================================================= -# HF cache directory names may differ from MODEL_NAME -declare -A MODEL_DIR_NAMES=( - ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" -) - # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) @@ -86,7 +79,11 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +# Extract hf_dir from models.yaml (the line after the model's top-level key) +DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") +DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" resolve_hf_cache_path() { @@ -270,10 +267,7 @@ exec sudo docker run --rm \ --security-opt seccomp=unconfined \ --privileged \ -v /sys:/sys \ - -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ - -v /lib/x86_64-linux-gnu/libionic.so.1:/lib/x86_64-linux-gnu/libionic.so.1:ro \ - -v /lib/x86_64-linux-gnu/libionic.so:/lib/x86_64-linux-gnu/libionic.so:ro \ - -v /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:ro \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ --shm-size 128G \ @@ -302,13 +296,13 @@ exec sudo docker run --rm \ -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e UCX_TLS=all \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ -e UCX_MEMTYPE_CACHE=y \ -e UCX_RNDV_SCHEME=get_zcopy \ -e UCX_RNDV_THRESH=4k \ -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ - -e UCX_LOG_LEVEL=info \ + -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml new file mode 100644 index 000000000..31197ec52 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -0,0 +1,41 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. 
HF cache layout: models--deepseek-ai--DeepSeek-R1-0528 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +DeepSeek-V3: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + +DeepSeek-R1-0528: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" + +gpt-oss-120b: + prefill_flags: "--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index b4ab7bce8..21fe506cb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -53,37 +53,43 @@ host_name=$(hostname) echo "[INFO] Management IP (barriers/proxy): $host_ip" echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" -# --------------------------------------------------------------------------- -# RDMA route setup for Pensando ionic (RoCEv2) point-to-point /31 links. -# Each benic interface has a /31 to the TOR switch. Without explicit routes, -# traffic to other nodes' RDMA IPs falls through to the management network -# (no RDMA capability). Fix: add a /24 route via the TOR gateway so RoCEv2 -# stays on the ionic fabric. 
-# --------------------------------------------------------------------------- -if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then - rdma_subnet="${BASH_REMATCH[1]}" - rdma_host="${BASH_REMATCH[2]}" - rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" # /31 peer = TOR switch - rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) - if [[ -n "$rdma_iface" ]]; then - ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ - echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ - echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" +# ============================================================================= +# RDMA / Nixl Workarounds +# ============================================================================= + +setup_rdma_env() { + # Pensando ionic (RoCEv2) point-to-point /31 route fix. + # Each benic interface has a /31 to the TOR switch. Without explicit routes, + # traffic to other nodes' RDMA IPs falls through to the management network. + if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + local rdma_subnet="${BASH_REMATCH[1]}" + local rdma_host="${BASH_REMATCH[2]}" + local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" + local rdma_iface + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi fi -fi -# Patch Nixl UCX backend: set ucx_error_handling_mode=none for shared-memory -# transport compatibility (Pensando ionic NICs don't support rdmacm, so the -# default UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors) -NIXL_API_FILE=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) -if [[ -n "$NIXL_API_FILE" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$NIXL_API_FILE"; then - sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$NIXL_API_FILE" - echo "[PATCH] Added ucx_error_handling_mode=none to $NIXL_API_FILE" - else - echo "[PATCH] ucx_error_handling_mode already set in $NIXL_API_FILE" + # Patch Nixl UCX backend: set ucx_error_handling_mode=none. + # Pensando ionic NICs don't support rdmacm, so the default + # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi fi -fi +} + +setup_rdma_env if [[ -z "$UCX_NET_DEVICES" ]]; then echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 @@ -91,56 +97,45 @@ if [[ -z "$UCX_NET_DEVICES" ]]; then fi # ============================================================================= -# Model-Specific Configuration Maps +# Model-Specific Configuration from YAML # ============================================================================= +MODELS_YAML="${VLLM_WS_PATH}/models.yaml" -declare -A MODEL_PREFILL_CONFIGS=( - ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["gpt-oss-120b"]="--tensor-parallel-size 8" -) - -declare -A MODEL_DECODE_CONFIGS=( - ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["gpt-oss-120b"]="--tensor-parallel-size 8" -) - -declare -A MODEL_ENVS=( - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" -) - -get_model_config() { - local mode="$1" - local model_name="$2" - if [[ "$mode" == "prefill" ]]; then - echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - elif [[ "$mode" == "decode" ]]; then - echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - fi -} - -get_model_envs() { - echo "${MODEL_ENVS[$1]:-""}" -} +if [[ ! 
-f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "$MODEL_NAME" ]]; then echo "ERROR: MODEL_NAME is not set"; exit 1 fi -PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") -DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") -PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -echo "Using model-specific configuration for: $MODEL_NAME" +eval "$(python3 -c " +import yaml, sys + +with open('${MODELS_YAML}') as f: + models = yaml.safe_load(f) + +model_name = '${MODEL_NAME}' +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def bash_escape(s): + \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\" + return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`') + +pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) +df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) +ev = bash_escape(m.get('env', '')) +print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') +print(f'DECODE_SERVER_CONFIG=\"{df}\"') +print(f'MODEL_ENVS=\"{ev}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" # ============================================================================= # Container Synchronization @@ -203,20 +198,15 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# Common UCX/Nixl environment for prefill and decode workers -setup_ucx_env() { - export UCX_TLS=all - export UCX_SOCKADDR_TLS_PRIORITY=tcp - export UCX_MEMTYPE_CACHE=y - export UCX_RNDV_SCHEME=get_zcopy - export UCX_RNDV_THRESH=4k - export UCX_ROCM_IPC_MIN_ZCOPY=0 - export HSA_ENABLE_SDMA=1 - export UCX_LOG_LEVEL=info +# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + for env_pair in ${MODEL_ENVS}; do + export "$env_pair" + done } # ============================================================================= @@ -334,10 +324,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${PREFILL_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -387,10 +374,7 @@ else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" echo "Using decode config: $DECODE_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${DECODE_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ From 7d9eb5124c241433b43ca87fd2a0404370faa5e8 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 20:20:51 +0000 Subject: [PATCH 03/31] [AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API Adapt server.sh to vLLM v0.17.1 breaking changes: - Use simplified kv-transfer-config (side channel via env vars instead of kv_ip/kv_port, add kv_load_failure_policy) - Remove deprecated --disable-log-requests (disabled by default in v0.17) - Route NIXL side channel through RDMA IP for correct fabric path - Fix RIXL ucx_error_handling_mode patch for updated _api.py layout --- 
benchmarks/multi_node/vllm_disagg_utils/env.sh | 2 +- benchmarks/multi_node/vllm_disagg_utils/server.sh | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index f4340e812..cc9b9320b 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -6,7 +6,7 @@ # Set by runner or auto-detected from hostname. # # The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 21fe506cb..d90e4b240 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -81,7 +81,7 @@ setup_rdma_env() { nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) if [[ -n "$nixl_api" ]]; then if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" else echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" @@ -202,8 +202,8 @@ echo "Decode node IPs: ${DECODE_ARGS}" setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 - export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} - export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} + export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -329,8 +329,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -379,8 +378,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then From 02a547ca395f133e7201f8ea5aa541d56ff56a09 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 12:13:36 +0000 Subject: [PATCH 04/31] [AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bench.sh: replace `vllm bench 
serve` (log-only output) with the shared run_benchmark_serving helper from benchmark_lib.sh, matching the SGLang disagg pattern. This produces the .json result files that the multinode CI workflow expects (benchmark-multinode-tmpl.yml → process_result.py). server.sh: make the Nixl ucx_error_handling_mode=none runtime patch conditional on Pensando ionic RDMA devices (IBDEVICES=*ionic*). On the mia1 cluster (ConnectX/mlx5, IBDEVICES=rdma*), UCX handles error mode natively and the patch is skipped. Model-path resolution and IBDEVICES/UCX/QoS auto-detection were verified to already work on mia1 — no changes needed. Tested locally (Job 2802, 1P+2D, ISL/OSL=1024): conc 8 → 507 tok/s conc 32 → 1778 tok/s conc 16 → 1004 tok/s conc 64 → 2480 tok/s All four .json result files produced; 100% external prefix cache hit rate. --- .../multi_node/vllm_disagg_utils/bench.sh | 27 ++++++++++--------- .../multi_node/vllm_disagg_utils/server.sh | 23 +++++++++------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index cfe66d460..69a178ca4 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -1,6 +1,9 @@ #!/bin/bash # vLLM Disaggregated Benchmark Runner # +# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh) +# so that the CI pipeline can collect and process results. +# # Usage: bash bench.sh \ # \ # @@ -11,7 +14,6 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 @@ -31,6 +33,10 @@ echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_ profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" mkdir -p "$profile_folder" +source "$(dirname "$0")/../../benchmark_lib.sh" + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" + for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" @@ -50,21 +56,18 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "num_prompts: $num_prompts" echo "export_file: $export_file" - vllm bench serve \ + run_benchmark_serving \ + --bench-serving-dir "$REPO_ROOT" \ --model "$MODEL_PATH" \ - --backend vllm \ - --host 127.0.0.1 \ --port "$ROUTER_PORT" \ - --dataset-name "random" \ - --random-input-len "$chosen_isl" \ - --random-output-len "$chosen_osl" \ - --random-prefix-len 0 \ + --backend openai \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ --num-prompts "$num_prompts" \ - --request-rate "$chosen_req_rate" \ - --ignore-eos \ --max-concurrency "$max_concurrency" \ - 2>&1 | tee "${export_file}.log" + --result-filename "$export_file" \ + --result-dir /workspace/ - sleep 5 echo "-----------------------------------------" done diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d90e4b240..933019abe 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -75,17 +75,22 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. 
- # Pensando ionic NICs don't support rdmacm, so the default + # Only needed for Pensando ionic NICs which don't support rdmacm — the default # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. + if [[ "${IBDEVICES:-}" == *ionic* ]]; then + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi fi + else + echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } From ab656e1423f70c4aa9b07df9f4ca3874f592c937 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 13:46:47 +0000 Subject: [PATCH 05/31] [AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0 Move the vllm-router from a dedicated proxy node onto the first prefill node, mirroring SGLang's co-location pattern. This reduces the node count from xP + yD + 1 to xP + yD (e.g., 3 nodes instead of 4 for 1P+2D). - server.sh: NODE_RANK=0 now runs both vllm serve (prefill, port 2584) and vllm-router (port 30000); barrier waits on all nodes - submit.sh / job.slurm: NUM_NODES = PREFILL_NODES + DECODE_NODES - bench.sh: ROUTER_PORT default updated to 30000 Local 1P+2D benchmark (ISL/OSL=1024, DeepSeek-R1 FP8, MI355X): - Throughput: +1.6% to +8.4% across concurrency 8-64 - Mean TTFT: -22% to -63% (prefill is local to router) - TPOT/ITL: unchanged (within noise) - 25% fewer nodes, no performance regression --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/bench.sh | 2 +- .../multi_node/vllm_disagg_utils/job.slurm | 10 ++-- .../multi_node/vllm_disagg_utils/server.sh | 49 ++++++++++++++----- .../multi_node/vllm_disagg_utils/submit.sh | 10 ++-- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 06f4d6b37..0c8ce6c07 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1007,7 +1007,7 @@ dsr1-fp8-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 69a178ca4..37b9d0b56 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -26,7 +26,7 @@ num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" echo "Config ${chosen_isl}; ${chosen_osl}; 
${chosen_concurrencies[0]}; ${chosen_req_rate}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 494ef6901..7b25fd4b5 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=vllm-pd-bench -#SBATCH -N 4 # CHECK this to be right in batch jobs -#SBATCH -n 4 # CHECK this to be right in batch jobs +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job #SBATCH --gres=gpu:8 @@ -127,9 +127,9 @@ echo "Final MODEL_PATH: $MODEL_PATH" # Node Selection and vLLM-Specific NUM_NODES # ============================================================================= -# vLLM needs xP + yD + 1 (dedicated proxy node) -NUM_NODES=$((xP + yD + 1)) -echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" +# Router co-located with first prefill: xP + yD nodes total (same as SGLang) +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located with first prefill)" FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 933019abe..8447046c1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -3,9 +3,11 @@ # ============================================================================= # # Node role assignment (by NODE_RANK): -# 0 -> Proxy/Router node -# 1..xP -> Prefill nodes (kv_producer) -# xP+1..xP+yD -> Decode nodes (kv_consumer) +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). # ============================================================================= # Environment Configuration @@ -32,7 +34,7 @@ BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" DRY_RUN="${DRY_RUN:-0}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" SERVER_PORT="${SERVER_PORT:-2584}" ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" @@ -192,11 +194,11 @@ IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" PREFILL_ARGS="" DECODE_ARGS="" -for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do +for ((i=0; i&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + echo "Waiting for all prefill and decode servers to be up . . ." 
python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${PD_IPADDRS} \ + --node-ips ${IPADDRS} \ --node-ports $SERVER_PORT \ --wait-for-all-ports \ --timeout 1800 @@ -322,11 +342,14 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server" - [[ "$DRY_RUN" -eq 0 ]] && kill $proxy_pid + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill_pid + fi -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" setup_vllm_env diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index a41a31d79..d60ed87e6 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -5,8 +5,8 @@ # This script submits a multi-node vLLM disaggregated benchmark job to SLURM. # It must be configured for your specific cluster before use. # -# Key difference from SGLang: vLLM uses a dedicated proxy node, so -# NUM_NODES = PREFILL_NODES + DECODE_NODES + 1. +# Router is co-located with the first prefill node (same as SGLang), so +# NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' @@ -67,8 +67,8 @@ CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} -# vLLM needs xP + yD + 1 nodes (dedicated proxy node) -NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +# Router co-located with first prefill: xP + yD nodes total +NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job @@ -77,7 +77,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # For vLLM, each worker = 1 node (TP=8 per node). -# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +# xP/yD must match the node counts so NUM_NODES = xP+yD is correct. export xP=$PREFILL_NODES export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES From 6bb39b4bf9b128c9f32bdcc1dc38b612aa5832ed Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 17:31:07 +0000 Subject: [PATCH 06/31] [AMD] Use public vLLM base image with runtime dependency install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the custom Docker image (vllm_disagg_pd:latest) with the public vllm/vllm-openai-rocm:v0.17.1 base image. Missing components (UCX, RIXL, etcd, libionic1, vllm-router) are now installed at container start via setup_deps.sh, which is sourced by server.sh. This eliminates the need to build, host, and maintain a custom image — CI nodes can pull directly from Docker Hub. Changes: - Add setup_deps.sh: idempotent installer for UCX (ROCm fork), RIXL, etcd, libionic1 (Pensando ionic), and vllm-router (NODE_RANK=0 only). Build steps run in subshells to avoid CWD pollution. 
- server.sh: source setup_deps.sh before any other logic - job.slurm: add --entrypoint "" to override the base image's vllm CLI entrypoint, allowing bash -lc to work correctly - env.sh: update comment (paths now set by setup_deps.sh, not image ENV) - amd-master.yaml: image changed to vllm/vllm-openai-rocm:v0.17.1 Tested locally (Job 2807, 3 nodes, ISL/OSL=1024): Setup overhead: ~2.5 min per node (all components built from source) Benchmark completed successfully across concurrency 8/16/32/64 --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/env.sh | 4 +- .../multi_node/vllm_disagg_utils/job.slurm | 1 + .../multi_node/vllm_disagg_utils/server.sh | 5 + .../vllm_disagg_utils/setup_deps.sh | 186 ++++++++++++++++++ 5 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0c8ce6c07..daa3c2806 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -995,7 +995,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp8-mi355x-vllm-disagg: - image: vllm_disagg_pd:latest + image: vllm/vllm-openai-rocm:v0.17.1 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index cc9b9320b..e1cc2f6af 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -5,8 +5,8 @@ # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) # Set by runner or auto-detected from hostname. # -# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). +# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is +# sourced at the top of server.sh before this file. set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 7b25fd4b5..3a71436fe 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -305,6 +305,7 @@ exec sudo docker run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8447046c1..efabf5e32 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -9,6 +9,11 @@ # # Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
+# ============================================================================= +# Dependency Setup (idempotent; required when using base vLLM image) +# ============================================================================= +source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" + # ============================================================================= # Environment Configuration # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh new file mode 100644 index 000000000..ee2524979 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. +# +# Base image: vllm/vllm-openai-rocm:v0.17.1 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. +# ============================================================================= + +ROCM_PATH="${ROCM_PATH:-/opt/rocm}" +UCX_HOME="${UCX_HOME:-/usr/local/ucx}" +RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" + +_SETUP_START=$(date +%s) +_SETUP_INSTALLED=() + +# --------------------------------------------------------------------------- +# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." 
+ ( + set -e + git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." + wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). +# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. vllm-router (Rust-based proxy for PD disaggregation) +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_vllm_router() { + if pip show vllm-router &>/dev/null; then + echo "[SETUP] vllm-router already installed" + return 0 + fi + + echo "[SETUP] Installing Rust toolchain..." + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + export PATH="/root/.cargo/bin:${PATH}" + fi + + echo "[SETUP] Installing vllm-router via pip..." + pip install --quiet vllm-router + + if ! 
pip show vllm-router &>/dev/null; then + echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + fi + _SETUP_INSTALLED+=("vllm-router") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic + +if [[ "${NODE_RANK:-0}" -eq 0 ]]; then + install_vllm_router +fi + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi From b4dad14e7f428d87618f25e63a890cc9b44ec05b Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 13 Mar 2026 14:19:12 +0000 Subject: [PATCH 07/31] [AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg decode Enable MoRI-based Expert Parallelism (--enable-expert-parallel --all2all-backend mori) on decode workers for DeepSeek-R1-0528, while keeping TP=8 to preserve KV cache transfer compatibility with the prefill node via NixlConnector. This matches SGLang's approach of TP=8 + EP within the TP group. KV Transfer: RIXL/NixlConnector (unchanged) MoE All-to-All: NCCL (default) -> MoRI-EP (--all2all-backend mori) Changes: - models.yaml: Add --enable-expert-parallel --all2all-backend mori to decode_flags; increase engine ready timeout to 1200s - setup_deps.sh: Add MoRI install and vLLM v0.17.1 patches for MoRI-EP + FP8 compatibility (AITER assertion, defer_input_quant) - server.sh: Support decode_env from models.yaml for decode-specific environment overrides - dsr1_fp8_mi355x_vllm-disagg.sh: Pass NODELIST to submit.sh for Slurm node constraints --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 4 +- .../multi_node/vllm_disagg_utils/models.yaml | 4 +- .../multi_node/vllm_disagg_utils/server.sh | 7 ++ .../vllm_disagg_utils/setup_deps.sh | 85 +++++++++++++++++++ 4 files changed, 96 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index a457a2714..167aff5f3 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -30,14 +30,14 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# vLLM disagg uses TP-only parallelism (no EP/DP). # PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. +# NODELIST (optional) constrains which Slurm nodes are used. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf) + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 31197ec52..4a720785a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -31,8 +31,8 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index efabf5e32..7778dfd34 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -142,9 +142,11 @@ def bash_escape(s): pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) ev = bash_escape(m.get('env', '')) +dev = bash_escape(m.get('decode_env', '')) print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') print(f'DECODE_SERVER_CONFIG=\"{df}\"') print(f'MODEL_ENVS=\"{ev}\"') +print(f'DECODE_MODEL_ENVS=\"{dev}\"') ")" echo "Loaded model configuration for: $MODEL_NAME" @@ -408,6 +410,11 @@ else setup_vllm_env + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index ee2524979..8e2276d1c 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -155,6 +155,89 @@ install_vllm_router() { _SETUP_INSTALLED+=("vllm-router") } +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# --------------------------------------------------------------------------- +install_mori() { + if python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] MoRI Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." + ( + set -e + git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout b645fc8 + pip install --quiet . + ) + rm -rf /opt/mori + + if ! 
python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + _SETUP_INSTALLED+=("MoRI") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM v0.17.1 MoRI-EP + FP8 incompatibility +# v0.17.1 asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. +# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + # ============================================================================= # Run installers # ============================================================================= @@ -163,6 +246,8 @@ install_ucx install_rixl install_etcd install_libionic +install_mori +patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_vllm_router From d2b9332374652707d250a88b2c3e9384f4dfd0fd Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 13 Mar 2026 23:25:36 +0000 Subject: [PATCH 08/31] [AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace NixlConnector with MoRIIOConnector for KV cache transfer and replace the Rust-based vllm-router with a MoRI-IO-aware Python proxy that handles both HTTP routing and ZMQ-based RDMA endpoint discovery. The key architectural change is that the proxy enriches each request's kv_transfer_params with remote RDMA endpoint info (handshake_port, notify_port, host, port) before dispatching, enabling concurrent prefill+decode in WRITE mode — something vllm-router could not do because it only understands HTTP, not the MoRI-IO registration protocol. 
Changes: - Add moriio_proxy.py: MoRI-IO-aware proxy with ZMQ service discovery, request enrichment, and /health endpoint (adapted from vLLM upstream moriio_toy_proxy_server.py) - server.sh: switch --kv-transfer-config from NixlConnector to MoRIIOConnector with kv_connector_extra_config (proxy_ip, proxy_ping_port, http_port); launch proxy before prefill on NODE_RANK=0; set VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 as workaround for v0.17.1 completion-ID mismatch (upstream fix: vllm-project/vllm#34907) - setup_deps.sh: replace vllm-router/Rust install with lightweight Python deps (quart, aiohttp, msgpack, pyzmq) for the proxy Benchmark (Job 2853 vs 2818 NixlConnector baseline, ISL/OSL=1024): TTFT median: -37% to -55% across C8–C64 (e.g. 384→241ms @C64) TTFT p99: -63% at C64 (6622→2469ms) Throughput: +8% at C64 (2634→2844 tok/s) TPOT: unchanged (~22ms @C64) --- .../vllm_disagg_utils/moriio_proxy.py | 309 ++++++++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 87 ++--- .../vllm_disagg_utils/setup_deps.sh | 29 +- 3 files changed, 358 insertions(+), 67 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py new file mode 100644 index 000000000..82272dd52 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in 
req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return match.groups() + + req_data = await request.get_json() + request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {} + req_data["kv_transfer_params"] = {} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": 
prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + ]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 7778dfd34..f81ff68e1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -212,12 +212,18 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# vLLM environment (UCX transport vars are set at the Docker level in job.slurm) setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 + # Workaround: disable request-ID randomization so MoRI-IO connector can + # match completion IDs between prefill and decode without PR #34907 patch. + export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -245,10 +251,26 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env + # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." + PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $VLLM_WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + set +x + proxy_pid=$! 
+ sleep 3 + fi + PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -270,56 +292,19 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Congratulations!!! All prefill and decode servers are up . . ." - echo "Starting vLLM Router..." - [ -f /root/.cargo/env ] && source /root/.cargo/env - - PREFILL_URLS="" - DECODE_URLS="" - for ip in ${PREFILL_ARGS}; do - PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " - done - for ip in ${DECODE_ARGS}; do - DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " - done - - ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ - vllm-router \ - --host 0.0.0.0 \ - --port $ROUTER_PORT \ - --vllm-pd-disaggregation \ - $PREFILL_URLS \ - $DECODE_URLS \ - --policy round_robin \ - --prefill-policy round_robin \ - --decode-policy round_robin \ - --intra-node-data-parallel-size 1 \ - --retry-max-retries 3 \ - --health-check-endpoint /health \ - --prometheus-port 29000" + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" + echo "DRY RUN: $HEALTH_BARRIER_CMD" else - ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log" - set -x - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - set +x - proxy_pid=$! 
- - HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-health \ - --health-endpoint /health \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -364,7 +349,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -418,7 +403,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 8e2276d1c..3af1b5b0e 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -131,28 +131,25 @@ install_libionic() { } # --------------------------------------------------------------------------- -# 5. vllm-router (Rust-based proxy for PD disaggregation) +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. # Only needed on NODE_RANK=0 (proxy node). # --------------------------------------------------------------------------- -install_vllm_router() { - if pip show vllm-router &>/dev/null; then - echo "[SETUP] vllm-router already installed" +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" return 0 fi - echo "[SETUP] Installing Rust toolchain..." - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - export PATH="/root/.cargo/bin:${PATH}" - fi - - echo "[SETUP] Installing vllm-router via pip..." - pip install --quiet vllm-router + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + pip install --quiet --ignore-installed blinker + pip install --quiet quart aiohttp msgpack pyzmq - if ! pip show vllm-router &>/dev/null; then - echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + if ! 
python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 fi - _SETUP_INSTALLED+=("vllm-router") + _SETUP_INSTALLED+=("mori-proxy-deps") } # --------------------------------------------------------------------------- @@ -250,7 +247,7 @@ install_mori patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then - install_vllm_router + install_mori_proxy_deps fi # ============================================================================= From 159b571dd433a5b4f1b9fa352ce8de99fcf41398 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 08:47:54 +0000 Subject: [PATCH 09/31] [AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh Signed-off-by: Theresa Shan --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 3 ++- .../multi_node/vllm_disagg_utils/submit.sh | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 167aff5f3..172ecdf51 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -37,7 +37,8 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ + ${RANDOM_RANGE_RATIO}) if [[ $? -ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index d60ed87e6..f210d7ac7 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,18 +12,19 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] + [NODE_LIST] [RANDOM_RANGE_RATIO] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + NODE_LIST Optional: comma-separated hostnames + RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -66,6 +67,7 @@ OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} +RANDOM_RANGE_RATIO=${10} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,10 +87,10 @@ export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Log directory: must be on NFS 
(shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"

From d7e9e501cd99f1c843318f4abbe3be73408c1ef5 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 10:22:58 +0000 Subject: [PATCH 10/31] Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but still ran the first barrier; 2. guard kill of the proxy/prefill PIDs, which are only set when DRY_RUN=0 Signed-off-by: Theresa Shan --- .../multi_node/vllm_disagg_utils/server.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index f81ff68e1..55538d4fa 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -284,11 +284,15 @@ if [ "$NODE_RANK" -eq 0 ]; then fi echo "Waiting for all prefill and decode servers to be up . . ." -    python3 $VLLM_WS_PATH/sync.py barrier \ -        --node-ips ${IPADDRS} \ -        --node-ports $SERVER_PORT \ -        --wait-for-all-ports \ -        --timeout 1800 +    if [[ "$DRY_RUN" -eq 1 ]]; then +        echo "DRY RUN: skipping barrier (wait-for-all-ports)" +    else +        python3 $VLLM_WS_PATH/sync.py barrier \ +            --node-ips ${IPADDRS} \ +            --node-ports $SERVER_PORT \ +            --wait-for-all-ports \ +            --timeout 1800 +    fi echo "Congratulations!!! All prefill and decode servers are up . . ." @@ -336,8 +340,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the proxy server and prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then -        kill $proxy_pid -        kill $prefill_pid +        [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true +        [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then

From fe12e828555a340313b3c83bd3dae2c299f8e042 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 19 Mar 2026 18:33:36 +0000 Subject: [PATCH 11/31] [AMD] Fix vLLM disagg hang: READ mode support + safety timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable READ-mode KV transfer (decode-initiated RDMA reads) with a critical scheduler assertion fix, and add safety timeouts to prevent indefinite hangs during RDMA transfers. Changes: - setup_deps.sh: Add patches — save_kv_layer/start_load_kv handshake timeouts (30s), RDMA transfer timeout (120s), deferred write task expiry (60s), write worker error handling, and scheduler assertion fix for READ-mode intermediate request states - moriio_proxy.py: Add stream idle timeout (PROXY_STREAM_IDLE_TIMEOUT) to abort stalled decode streams, and proper response.release() - submit.sh, job.slurm: Plumb PROXY_STREAM_IDLE_TIMEOUT and VLLM_MORIIO_CONNECTOR_READ_MODE env vars into Docker containers Validated: 1k/1k full sweep (C8–C512), 100% success rate at all concurrency levels, peak 8500 output tok/s at C512.
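For reference, a minimal sketch of how the new knobs can be exercised from the submit host (illustrative only: the variable names, defaults, and submit.sh argument order are the ones used in this recipe; the worker counts and the 0/1 semantics of VLLM_MORIIO_CONNECTOR_READ_MODE are assumptions, and the SLURM_*/TIME_LIMIT/MODEL_*/CONTAINER_IMAGE environment that submit.sh checks is assumed to be exported already):

    # Positional args: PREFILL_NODES PREFILL_WORKERS DECODE_NODES DECODE_WORKERS ISL OSL CONCURRENCIES REQUEST_RATE
    export VLLM_MORIIO_CONNECTOR_READ_MODE=1   # assumed 0/1 toggle for decode-initiated reads; submit.sh defaults to 0
    export PROXY_STREAM_IDLE_TIMEOUT=300       # proxy aborts a decode stream idle longer than this (seconds)
    # Inside the container, the patched connector/scheduler code additionally honors:
    #   VLLM_MORIIO_HANDSHAKE_TIMEOUT (default 30s), VLLM_MORIIO_TRANSFER_TIMEOUT (default 120s),
    #   VLLM_MORIIO_DEFER_TIMEOUT (default 60s)
    bash submit.sh 1 1 2 1 1024 1024 "8x16x32x64x128x256x512" inf

Both exported variables are picked up by submit.sh and forwarded into the worker containers by job.slurm via docker -e, so no image rebuild is needed to change them between sweeps.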
--- .../multi_node/vllm_disagg_utils/job.slurm | 2 + .../vllm_disagg_utils/moriio_proxy.py | 21 +- .../vllm_disagg_utils/setup_deps.sh | 468 +++++++++++++++++- .../multi_node/vllm_disagg_utils/submit.sh | 3 + 4 files changed, 489 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 3a71436fe..b216f53f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -304,6 +304,8 @@ exec sudo docker run --rm \ -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py index 82272dd52..b2162c98a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -18,6 +18,7 @@ import re import socket import threading +import time import uuid import aiohttp @@ -37,6 +38,8 @@ request_nums = 0 app = Quart(__name__) +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") TRANSFER_TYPE = None @@ -173,13 +176,27 @@ async def start_decode_request(endpoint, req_data, request_id): async def stream_decode_response(session, response, request_id): try: if response.status == 200: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break else: raise RuntimeError( f"Decode response status={response.status}" ) finally: + await response.release() await session.close() diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 3af1b5b0e..467e1bd5a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -16,6 +16,19 @@ RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" _SETUP_START=$(date +%s) _SETUP_INSTALLED=() +git_clone_retry() { + local url="$1" dest="$2" max_tries=3 try=1 + while (( try <= max_tries )); do + if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi + echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." + rm -rf "$dest" + sleep 10 + (( try++ )) + done + echo "[SETUP] git clone failed after $max_tries attempts: $url" + return 1 +} + # --------------------------------------------------------------------------- # 1. 
UCX (ROCm fork — required for GPU-direct RDMA via Nixl) # --------------------------------------------------------------------------- @@ -36,7 +49,7 @@ install_ucx() { ( set -e mkdir -p /usr/local/src && cd /usr/local/src - git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx git checkout da3fac2a ./autogen.sh && mkdir -p build && cd build ../configure \ @@ -74,7 +87,7 @@ install_rixl() { echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." ( set -e - git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl git checkout f33a5599 meson setup build --prefix="${RIXL_HOME}" \ -Ducx_path="${UCX_HOME}" \ @@ -171,7 +184,7 @@ install_mori() { echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." ( set -e - git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori git checkout b645fc8 pip install --quiet . ) @@ -235,6 +248,451 @@ else: _SETUP_INSTALLED+=("MoRI-FP8-patch") } +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? + if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. 
Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. +# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + 
_wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. 
+# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM v0.17.1 asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + req = self.requests[req_id] + if RequestStatus.is_finished(req.status): + self._free_blocks(req) + else: + logger.debug( + "Request %s send finished but status=%s, " + "deferring block free to request completion", + req_id, req.status.name)""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + # ============================================================================= # Run installers # ============================================================================= @@ -245,6 +703,10 @@ install_etcd install_libionic install_mori patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_mori_proxy_deps diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index f210d7ac7..5d733b010 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ 
b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -92,6 +92,9 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" From 3e05159a59457dba3d5675f7ec9f0fd5505b05c0 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 21 Mar 2026 19:15:33 +0000 Subject: [PATCH 12/31] Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs) Port the vLLM disaggregated serving pipeline from the 4N cluster (Pensando ionic NICs) to the 9N mia1 cluster (mlx5/rdma NICs). Key changes: - Fix C512 deadlock: apply ucx_error_handling_mode=none universally instead of only for ionic NICs. Under high concurrency, UCX's default UCP_ERR_HANDLING_MODE_PEER prevents RIXL RDMA READ retries from recovering after ibv_post_send queue exhaustion, causing prefill KV cache saturation and pipeline deadlock. - Force-reinstall MoRI from b645fc8 to fix PCI topology assertion failure on nodes with Broadcom PEX890xx PCIe switches. - Auto-detect Docker privilege (sudo vs non-sudo) for cross-cluster portability. - Add SLURM_EXCLUDE_NODES support to skip nodes with broken Docker sockets. - Increase VLLM_ENGINE_READY_TIMEOUT_S to 3600 to accommodate longer setup times (RIXL/MoRI source builds over NFS). --- .../multi_node/vllm_disagg_utils/job.slurm | 20 +++++++++---- .../multi_node/vllm_disagg_utils/models.yaml | 2 +- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++---------- .../vllm_disagg_utils/setup_deps.sh | 25 ++++++++++++---- .../multi_node/vllm_disagg_utils/submit.sh | 8 +++++ 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index b216f53f4..904aaaff4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -61,6 +61,16 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +# ============================================================================= +# Docker privilege detection +# ============================================================================= +if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,7 +222,7 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." 
} @@ -240,10 +250,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true +$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true -exec sudo docker run --rm \ +exec $DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -320,4 +330,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 4a720785a..ef062e5f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -32,7 +32,7 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 55538d4fa..d21bdbebb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -82,22 +82,21 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Only needed for Pensando ionic NICs which don't support rdmacm — the default - # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. - if [[ "${IBDEVICES:-}" == *ionic* ]]; then - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" - fi + # Required for ALL NIC types under high concurrency (C512+). Without this, + # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error + # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from + # recovering gracefully. This causes the prefill KV cache to fill to 100% + # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm + # incompatibility); on mlx5 NICs it was incorrectly skipped. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" fi - else - echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 467e1bd5a..a6b1f79cb 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -171,8 +171,18 @@ install_mori_proxy_deps() { # GPU kernels are JIT-compiled on first use; no hipcc needed at install. # --------------------------------------------------------------------------- install_mori() { - if python3 -c "import mori" 2>/dev/null; then - echo "[SETUP] MoRI Python bindings already present" + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" + + # The pre-installed MoRI in vllm base images has a PCI topology bug: it + # only maps the secondary bus of each bridge instead of the full + # secondary-to-subordinate range (dsp2dev). This causes an assertion + # failure in TopoSystemPci::Load() on nodes with deeply-nested PCIe + # switch topologies (e.g. Broadcom PEX890xx on MI355X mia1 nodes). + # Always rebuild from the target commit unless the marker file proves + # the correct version was already installed in this container. + if ls $MORI_MARKER &>/dev/null; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" return 0 fi @@ -181,19 +191,22 @@ install_mori() { libopenmpi-dev openmpi-bin libpci-dev \ && rm -rf /var/lib/apt/lists/* - echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." + echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding pre-installed version to fix PCI topology bug)" ( set -e git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori - git checkout b645fc8 - pip install --quiet . + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . ) rm -rf /opt/mori if ! python3 -c "import mori" 2>/dev/null; then echo "[SETUP] ERROR: MoRI build failed"; exit 1 fi - _SETUP_INSTALLED+=("MoRI") + # Drop a marker so re-entry doesn't rebuild + touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } # --------------------------------------------------------------------------- diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 5d733b010..c5404ec18 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -112,6 +112,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. 
+EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -119,6 +126,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" From 4d1a315f0f5bea2195b4c0e7f3e8b3a98d219fb6 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 12:38:46 +0000 Subject: [PATCH 13/31] [AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side: RIXL can lose `finished_sending` notifications under high concurrency with ibv_post_send failures, permanently leaking prefill KV blocks. Over multiple benchmark rounds (sweep), leaked blocks accumulate and saturate the prefill KV cache, deadlocking C512. - Fix finished_sending handler to unconditionally free KV blocks (the conditional status check had no recovery path, causing leaks) - Add idle KV block reaper: detects engine idle >5s with finished requests still holding blocks, then force-frees them - Add 10s cooldown between benchmark rounds for reaper activation Client-side: SSE streaming loop did not break on the [DONE] sentinel, causing the benchmark client to hang when the proxy held connections open after request completion. - Break SSE loop on [DONE] in completions and chat completions - Share a single aiohttp.ClientSession across all requests (connection pooling via TCPConnector instead of per-request session creation) - Add asyncio.wait_for timeout around asyncio.gather with proper task cancellation and partial result collection - Reduce AIOHTTP_TIMEOUT from 6h to 30min Verified: sweep 1K/1K C128→C256→C512 all pass (Job 6222, 9N cluster). --- .../multi_node/vllm_disagg_utils/bench.sh | 2 + .../vllm_disagg_utils/setup_deps.sh | 123 ++++++++++++- utils/bench_serving/backend_request_func.py | 170 +++++++++++------- utils/bench_serving/benchmark_serving.py | 57 ++++-- 4 files changed, 263 insertions(+), 89 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 37b9d0b56..5b9f5c772 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -70,4 +70,6 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do --result-dir /workspace/ echo "-----------------------------------------" + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
+ sleep 10 done diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a6b1f79cb..a95591cb5 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -683,14 +683,7 @@ try: if req_id not in self.requests: logger.debug("Request %s already removed, skipping send", req_id) continue - req = self.requests[req_id] - if RequestStatus.is_finished(req.status): - self._free_blocks(req) - else: - logger.debug( - "Request %s send finished but status=%s, " - "deferring block free to request completion", - req_id, req.status.name)""" + self._free_blocks(self.requests[req_id])""" if old_send in new_src: new_src = new_src.replace(old_send, new_send, 1) @@ -706,6 +699,119 @@ except Exception as e: _SETUP_INSTALLED+=("scheduler-read-mode-fix") } +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. 
+ marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _num_waiting = sum(1 for r in self.requests.values() + if r.status == RequestStatus.WAITING) + _is_idle = (_num_running == 0 and _num_waiting == 0) + + if _is_idle: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + _reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + # ============================================================================= # Run installers # ============================================================================= @@ -720,6 +826,7 @@ patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_mori_proxy_deps diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index af030720e..89830ccbc 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from 
transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) @dataclass @@ -49,12 +49,16 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -62,7 +66,6 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. } payload = { "inputs": request_func_input.prompt, @@ -113,21 +116,28 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -181,18 +191,25 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { @@ -225,23 +242,30 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -281,33 +305,35 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk == "[DONE]": + break + + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -324,6 +350,9 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) @@ -333,14 +362,18 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -387,28 +420,30 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk == "[DONE]": + break + + timestamp = time.perf_counter() + data = json.loads(chunk) - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") - most_recent_timestamp = timestamp + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -420,6 +455,9 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 647165da9..b63a0427e 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -39,9 +39,10 @@ from multiprocessing import Pool, cpu_count from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +import aiohttp import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, + RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -470,11 +471,14 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) + shared_session = aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -493,11 +497,13 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext() + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) async def warmup_limited_req_fn(): async with warmup_semaphore: - return await request_func(request_func_input=test_input, pbar=warmup_pbar) + return await request_func( + request_func_input=test_input, pbar=warmup_pbar, + session=shared_session) warmup_tasks = [] for _ in range(num_warmups): @@ -510,7 +516,6 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: - # For each input request, choose a LoRA module at random. lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -527,7 +532,8 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler started") @@ -542,20 +548,16 @@ async def warmup_limited_req_fn(): pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) semaphore = (asyncio.Semaphore(max_concurrency) if max_concurrency else None) async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) print("Starting main benchmark run...") @@ -582,7 +584,28 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + gather_timeout = max(7200, len(input_requests) * 30) + try: + outputs: List[RequestFuncOutput] = await asyncio.wait_for( + asyncio.gather(*tasks), timeout=gather_timeout) + except asyncio.TimeoutError: + completed = pbar.n if pbar else "?" + print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " + f"({completed}/{len(tasks)} requests completed). 
" + "Collecting partial results...") + for task in tasks: + if not task.done(): + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + outputs = [] + for task in tasks: + if task.done() and not task.cancelled(): + try: + outputs.append(task.result()) + except Exception: + outputs.append(RequestFuncOutput()) + else: + outputs.append(RequestFuncOutput()) if profile: print("Stopping profiler...") @@ -595,10 +618,14 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler stopped") + await shared_session.close() + await connector.close() + if pbar is not None: pbar.close() From 0006d60f66212d0a5df84bc3d6b087541593344f Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 18:21:22 +0000 Subject: [PATCH 14/31] [AMD] Fix vLLM disagg Slurm job never terminating after benchmark completion Background processes (proxy, prefill, decode, etcd) were started via `cmd 2>&1 | tee logfile &`, causing bash $! to capture the PID of tee rather than the actual process. `kill $pid` only killed tee, leaving the real process running. The proxy kept port 30000 open, so decode nodes' `sync.py wait` never detected shutdown and the Slurm job hung forever. Additionally, etcd's stderr was not redirected, holding the Docker container's main pipe open and preventing container exit even after server.sh completed. Changes: - Redirect all background processes to log files instead of piping through tee, so $! captures the correct PID (matches SGLang pattern) - Redirect etcd launcher's stderr to prevent pipe leak - Add pkill fallback cleanup for proxy, vllm, and etcd processes - Increase barrier grace period to handle node setup time variance - Increase container creation barrier timeout from 300s to 600s --- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++++-------- .../multi_node/vllm_disagg_utils/sync.py | 5 +++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d21bdbebb..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -162,14 +162,14 @@ python3 $VLLM_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 5000 \ --wait-for-all-ports \ - --timeout 300 + --timeout 600 # ============================================================================= # ETCD Server Setup # ============================================================================= echo "Proceeding to start etcd server on $host_name" -bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null 2>&1 & etcd_pid=$! echo "Waiting at etcd server barrier on $host_name" @@ -260,7 +260,7 @@ if [ "$NODE_RANK" -eq 0 ]; then else PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" set -x - eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & set +x proxy_pid=$! 
sleep 3 @@ -275,9 +275,9 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! fi @@ -341,6 +341,10 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 0 ]]; then [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then @@ -358,9 +362,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! fi @@ -390,7 +394,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then fi echo "Killing the prefill server" - [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" @@ -412,9 +416,9 @@ else if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $DECODE_CMD" else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & set +x decode_pid=$! fi @@ -444,11 +448,12 @@ else fi echo "Killing the decode server" - [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi echo "Killing the etcd server" -kill $etcd_pid +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/sync.py +++ b/benchmarks/multi_node/vllm_disagg_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. 
+ grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() From a00214360d4a4b4093dc5e6f94738a4991756b39 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:44:27 +0000 Subject: [PATCH 15/31] [AMD] Enable MoRI-IO READ mode by default for vLLM disagg --- .github/configs/amd-master.yaml | 3 +++ benchmarks/multi_node/vllm_disagg_utils/job.slurm | 2 +- benchmarks/multi_node/vllm_disagg_utils/submit.sh | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index daa3c2806..e91e00f2a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1017,6 +1017,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1037,6 +1038,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1057,6 +1059,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 904aaaff4..c555f6948 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -315,7 +315,7 @@ exec $DOCKER_CMD run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ - -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index c5404ec18..7063aa7a8 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -93,7 +93,7 @@ export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} -export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. 
export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" From eba7f66828874f6589eea69e68fbc0235474d65a Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:57:24 +0000 Subject: [PATCH 16/31] [AMD] Fix CI checkout failure caused by root-owned __pycache__ files Fix per-node Docker privilege detection in vLLM disagg job.slurm --- .../multi_node/vllm_disagg_utils/job.slurm | 18 ++++++++++++++---- .../multi_node/vllm_disagg_utils/server.sh | 3 +++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index c555f6948..d33525081 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -64,6 +64,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Docker privilege detection # ============================================================================= +# Detect on the batch host (used for post-srun cleanup). +# Per-node detection happens inside the srun inline script below because +# some nodes may require sudo while others do not. if docker ps &>/dev/null; then DOCKER_CMD="docker" else @@ -249,11 +252,18 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node Docker privilege detection (some nodes need sudo, others don't) +if docker ps &>/dev/null; then + _DCMD=docker +else + _DCMD='sudo docker' +fi + # Pre-clean (idempotent) -$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true -$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true +\$_DCMD ps -aq --filter \"name=^container_vllm_\" | xargs -r \$_DCMD rm -f || true +\$_DCMD ps -aq | xargs -r \$_DCMD stop || true -exec $DOCKER_CMD run --rm \ +exec \$_DCMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -330,4 +340,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'if docker ps &>/dev/null; then D=docker; else D="sudo docker"; fi; $D rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..85a50b38d 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,5 +455,8 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true +# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout +find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true + echo "Script completed successfully" exit 0 From 8c01e383c152bd1c13bca92ade6a88fa4ae70276 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 09:07:02 +0000 Subject: [PATCH 17/31] [AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS Docker containers run as root, so __pycache__/*.pyc files created during benchmark_serving.py import end up root-owned on the NFS workspace. The CI runner cannot delete them, breaking checkout. Set PYTHONPYCACHEPREFIX=/tmp/pycache in the Docker env so bytecache stays inside the container. Remove the previous server.sh find-and- delete workaround since the root cause is now addressed. 
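A minimal sketch of the intended effect, with an illustrative host path and image name (neither is part of this patch): once PYTHONPYCACHEPREFIX points at a container-local directory, CPython writes its bytecode cache there instead of creating root-owned __pycache__ directories next to the .py files on the NFS-mounted workspace.

  # Illustrative only: /nfs/workspace and example-image are placeholders.
  docker run --rm \
    -v /nfs/workspace:/workspace \
    -e PYTHONPYCACHEPREFIX=/tmp/pycache \
    example-image \
    python3 -c "import compileall; compileall.compile_dir('/workspace', quiet=1)"
  # Bytecode lands under /tmp/pycache inside the container; the NFS workspace
  # stays free of root-owned __pycache__ directories, so the CI runner can
  # clean the checkout as an unprivileged user.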
--- benchmarks/multi_node/vllm_disagg_utils/job.slurm | 1 + benchmarks/multi_node/vllm_disagg_utils/server.sh | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index d33525081..bc04f3b61 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -326,6 +326,7 @@ exec \$_DCMD run --rm \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 85a50b38d..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,8 +455,5 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true -# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout -find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true - echo "Script completed successfully" exit 0 From 7f033620216628aeaa495183ace77381d0d1d274 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 16:28:18 +0000 Subject: [PATCH 18/31] [AMD] Fix KV reaper deadlock on high-ISL disagg workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idle KV block reaper only fired when both running=0 AND waiting=0. Under 8K ISL at C64+, leaked blocks filled the prefill KV cache while new requests queued in WAITING state — the non-empty wait queue prevented the reaper from ever triggering, causing a permanent hang. Remove the waiting-queue check so the reaper fires whenever no requests are actively running, which is precisely when leaked blocks can be safely reclaimed. Verified with 8K/1K sweep (C32–C512) completing without hangs. --- benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a95591cb5..e8437a5c9 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -767,11 +767,9 @@ try: _REAPER_IDLE_SECS = 5.0 _num_running = sum(1 for r in self.requests.values() if r.status == RequestStatus.RUNNING) - _num_waiting = sum(1 for r in self.requests.values() - if r.status == RequestStatus.WAITING) - _is_idle = (_num_running == 0 and _num_waiting == 0) + _should_reap = (_num_running == 0) - if _is_idle: + if _should_reap: if not self._idle_kv_reaper_active: self._idle_kv_reaper_active = True self._idle_kv_reaper_ts = _time.monotonic() From 5fedd82d7333e711252e4a60532c28944fdad796 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 24 Mar 2026 08:35:21 +0000 Subject: [PATCH 19/31] [AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,DECODE_EP,DECODE_DP_ATTN from amd-master.yaml config. 
Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 6 +-- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 39 +++++++++++++-- .../multi_node/vllm_disagg_utils/job.slurm | 14 ++++++ .../multi_node/vllm_disagg_utils/server.sh | 31 ++++++++++++ .../multi_node/vllm_disagg_utils/submit.sh | 50 +++++++++++++------ 5 files changed, 119 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e91e00f2a..55ca02841 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1021,7 +1021,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1042,7 +1042,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1063,7 +1063,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 172ecdf51..b21e9204a 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -11,8 +11,12 @@ check_env_vars \ MODEL_PATH \ PREFILL_NUM_WORKERS \ PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ DECODE_NUM_WORKERS \ DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ RANDOM_RANGE_RATIO @@ -30,15 +34,42 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. -# NODELIST (optional) constrains which Slurm nodes are used. +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ - ${RANDOM_RANGE_RATIO}) + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index bc04f3b61..e1cad0817 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -217,6 +217,14 @@ export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +# TP / EP / DP (from vllm_disagg_utils/submit.sh; mirrors amd_utils disagg) +export PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +export PREFILL_TP="${PREFILL_TP:-8}" +export DECODE_TP="${DECODE_TP:-8}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" @@ -327,6 +335,12 @@ exec \$_DCMD run --rm \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ + -e PREFILL_TP=\$PREFILL_TP \ + -e DECODE_TP=\$DECODE_TP \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..9b0ff2ebb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -150,6 +150,37 @@ print(f'DECODE_MODEL_ENVS=\"{dev}\"') echo "Loaded model configuration for: $MODEL_NAME" +# Apply tensor-parallel size and EP/DP flags from submit pipeline (YAML PREFILL_TP / dp-attn / ep). +if [[ -n "${PREFILL_TP:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP}" + fi +fi +if [[ -n "${DECODE_TP:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + # ============================================================================= # Container Synchronization # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 7063aa7a8..ecb5a9876 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,19 +12,29 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] [RANDOM_RANGE_RATIO] + \ + \ + \ + \ + [NODE_LIST] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames - RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false (from PREFILL_EP in YAML; false when EP==1) + PREFILL_ENABLE_DP true/false (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false (from DECODE_EP in YAML) + DECODE_ENABLE_DP true/false (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -57,7 +67,7 @@ check_env RUNNER_NAME GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -# COMMAND_LINE ARGS +# COMMAND_LINE ARGS (aligned with benchmarks/multi_node/amd_utils/submit.sh) PREFILL_NODES=$1 PREFILL_WORKERS=${2:-1} DECODE_NODES=$3 @@ -66,8 +76,14 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -NODE_LIST=${9} -RANDOM_RANGE_RATIO=${10} +PREFILL_ENABLE_EP=${9:-false} +PREFILL_ENABLE_DP=${10:-false} +DECODE_ENABLE_EP=${11:-false} +DECODE_ENABLE_DP=${12:-false} +PREFILL_TP=${13:-8} +DECODE_TP=${14:-8} +RANDOM_RANGE_RATIO=${15:-0.8} +NODE_LIST=${16} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,6 +101,12 @@ export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME +export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} +export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} +export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} +export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} +export PREFILL_TP=${PREFILL_TP} +export DECODE_TP=${DECODE_TP} export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} From 708570b7360deb788f92efbee5a9ce1688561d81 Mon Sep 17 00:00:00 2001 From: Chun 
Fang Date: Sun, 29 Mar 2026 17:13:53 +0000 Subject: [PATCH 20/31] [AMD] Upgrade vLLM disagg image from v0.17.1 to v0.18.0 Bump vllm/vllm-openai-rocm to v0.18.0 for the dsr1-fp8-mi355x-vllm-disagg config. Changes required by the new image: - setup_deps.sh: drop aiohttp/pyzmq installs (now pre-installed in v0.18.0); move install_mori_proxy_deps before patches and run on all nodes so msgpack is available when patch scripts import MoRI-IO connector modules - moriio_proxy.py: populate transfer_id in kv_transfer_params dicts (new required field in v0.18.0's moriio_connector.update_state_after_alloc) - MoRI PCI topology bug persists in v0.18.0; rebuild from b645fc8 retained Tested: 1K1K C8,16,32,64,128,256 on mia1 3-node (1P+2D) CONC512 is ongoing but it seems good so far --- .github/configs/amd-master.yaml | 2 +- .../vllm_disagg_utils/moriio_proxy.py | 5 +-- .../vllm_disagg_utils/setup_deps.sh | 34 +++++++++---------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 55ca02841..68400c158 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -995,7 +995,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.17.1 + image: vllm/vllm-openai-rocm:v0.18.0 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py index b2162c98a..7d1e8454b 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -244,8 +244,8 @@ def extract_ip_port_fast(url): dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {} - req_data["kv_transfer_params"] = {} + req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} + req_data["kv_transfer_params"] = {"transfer_id": request_id} req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( decode_instance_endpoint["dp_size"] ) @@ -269,6 +269,7 @@ def extract_ip_port_fast(url): req_data["max_tokens"] -= 1 req_data["kv_transfer_params"] = { + "transfer_id": request_id, "do_remote_decode": False, "do_remote_prefill": True, "remote_handshake_port": prefill_instance_endpoint["handshake_port"], diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index e8437a5c9..42aa648b0 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -2,7 +2,7 @@ # ============================================================================= # setup_deps.sh — Install missing vLLM disagg dependencies at container start. # -# Base image: vllm/vllm-openai-rocm:v0.17.1 +# Base image: vllm/vllm-openai-rocm:v0.18.0 # Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. # Idempotent: each component is skipped if already present. # @@ -156,8 +156,11 @@ install_mori_proxy_deps() { fi echo "[SETUP] Installing MoRI-IO proxy Python deps..." + # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack + # are missing. --ignore-installed blinker avoids pip's distutils uninstall + # error when quart pulls a newer blinker version. 
pip install --quiet --ignore-installed blinker - pip install --quiet quart aiohttp msgpack pyzmq + pip install --quiet quart msgpack if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 @@ -169,18 +172,16 @@ install_mori_proxy_deps() { # 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) # Required for --all2all-backend mori (Expert Parallelism via RDMA). # GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# +# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI +# topology bug (TopoSystemPci::Load assertion failure on Broadcom +# PEX890xx switches). Always rebuild from our target commit b645fc8 +# which includes the dsp2dev subordinate-range fix. # --------------------------------------------------------------------------- install_mori() { local MORI_TARGET_COMMIT="b645fc8" local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" - # The pre-installed MoRI in vllm base images has a PCI topology bug: it - # only maps the secondary bus of each bridge instead of the full - # secondary-to-subordinate range (dsp2dev). This causes an assertion - # failure in TopoSystemPci::Load() on nodes with deeply-nested PCIe - # switch topologies (e.g. Broadcom PEX890xx on MI355X mia1 nodes). - # Always rebuild from the target commit unless the marker file proves - # the correct version was already installed in this container. if ls $MORI_MARKER &>/dev/null; then echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" return 0 @@ -192,7 +193,7 @@ install_mori() { && rm -rf /var/lib/apt/lists/* echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." - echo "[SETUP] (overriding pre-installed version to fix PCI topology bug)" + echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" ( set -e git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori @@ -204,14 +205,13 @@ install_mori() { if ! python3 -c "import mori" 2>/dev/null; then echo "[SETUP] ERROR: MoRI build failed"; exit 1 fi - # Drop a marker so re-entry doesn't rebuild touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } # --------------------------------------------------------------------------- -# 7. Patch vLLM v0.17.1 MoRI-EP + FP8 incompatibility -# v0.17.1 asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) +# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel # uses defer_input_quant=True which MoRI's prepare/finalize rejects. # Patch: remove both the AITER requirement assertion and the # defer_input_quant NotImplementedError so non-AITER kernels work. @@ -621,10 +621,11 @@ except Exception as e: # --------------------------------------------------------------------------- # 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished -# vLLM v0.17.1 asserts that a request in finished_recving must be either +# vLLM asserts that a request in finished_recving must be either # WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can # transition to RUNNING before the aggregated recv notification arrives, # crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0) # --------------------------------------------------------------------------- patch_scheduler_read_mode_fix() { python3 -c ' @@ -819,6 +820,7 @@ install_rixl install_etcd install_libionic install_mori +install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout @@ -826,10 +828,6 @@ patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper -if [[ "${NODE_RANK:-0}" -eq 0 ]]; then - install_mori_proxy_deps -fi - # ============================================================================= # Export paths (persists for server.sh since this file is sourced) # ============================================================================= From 96154d2124becb15cf51fcd00790957cdccd5b40 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 30 Mar 2026 08:27:13 +0000 Subject: [PATCH 21/31] [AMD] Add Kimi-K2.5-MXFP4 disagg inference config (1P2D) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable vLLM disagg serving for amd/Kimi-K2.5-MXFP4 on MI355X with a 1P2D node topology (TP=8, decode EP=8). Changes: - amd-master.yaml: add kimik2.5-fp4-mi355x-vllm-disagg config with three seq-len scenarios (1K1K, 8K1K), READ mode enabled - models.yaml: add Kimi-K2.5-MXFP4 server flags (PIECEWISE cudagraph, --gpu-memory-utilization 0.90, --mm-encoder-tp-mode data) - bench.sh: add --trust-remote-code for models with custom code - setup_deps.sh: install amd-quark for MXFP4 quantization support - Add kimik2.5_fp4_mi355x_vllm-disagg.sh entry script Verified with full 1K/1K sweep (CONC 8–512) on SA4N and mia1 9N cluster; all concurrency levels completed without hang. --- .github/configs/amd-master.yaml | 33 +++++++- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 79 +++++++++++++++++++ .../multi_node/vllm_disagg_utils/bench.sh | 3 +- .../multi_node/vllm_disagg_utils/models.yaml | 6 ++ .../vllm_disagg_utils/setup_deps.sh | 22 ++++++ 5 files changed, 141 insertions(+), 2 deletions(-) create mode 100755 benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 68400c158..484e4322c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1047,9 +1047,20 @@ dsr1-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: - isl: 1024 - osl: 8192 + osl: 1024 search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: @@ -1068,6 +1079,26 @@ dsr1-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 
000000000..b21e9204a --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 5b9f5c772..274c5954e 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -67,7 +67,8 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --trust-remote-code echo "-----------------------------------------" echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
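As a quick pre-flight check (illustrative only, not part of this patch), the amd-quark dependency that setup_deps.sh installs for MXFP4 models can be verified inside the running container before the Kimi-K2.5 servers come up:

  # Hypothetical sanity check; DOCKER_CONT_NAME is the container started by job.slurm.
  docker exec "$DOCKER_CONT_NAME" python3 -c "import quark, vllm; print('MXFP4 prerequisites present')"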
diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index ef062e5f4..0ef2bc77f 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -35,6 +35,12 @@ DeepSeek-R1-0528: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + gpt-oss-120b: prefill_flags: "--tensor-parallel-size 8" decode_flags: "--tensor-parallel-size 8" diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 42aa648b0..848bd6918 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -209,6 +209,27 @@ install_mori() { _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } +# --------------------------------------------------------------------------- +# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) +# Required due to ROCm vLLM missing the quark dependency: +# https://github.com/vllm-project/vllm/issues/35633 +# --------------------------------------------------------------------------- +install_amd_quark() { + if python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] amd-quark already present" + return 0 + fi + + echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." + pip install --quiet amd-quark + + if ! python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" + return 0 + fi + _SETUP_INSTALLED+=("amd-quark") +} + # --------------------------------------------------------------------------- # 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) # vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel @@ -820,6 +841,7 @@ install_rixl install_etcd install_libionic install_mori +install_amd_quark install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout From 25d1f596b1bcb96d4daebddc56c0c702297ffb51 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:08:57 +0000 Subject: [PATCH 22/31] feat: add MiniMax M2.5 PD disaggregation recipe (1P2D, MoRI-EP + MoRI-IO) Cherry-picked from ChuanLi1101/InferenceMAX:chuali/minimax-m25-vllm-disagg (commit 72a0002e). Resolved conflict in models.yaml to keep both Kimi-K2.5-MXFP4 and MiniMax-M2.5 entries. Add multi-node vLLM PD disaggregation support for MiniMax-M2.5 (FP8), following the DeepSeek R1 disagg recipe pattern. 
Includes: - models.yaml: MiniMax-M2.5 config with TP8 prefill / TP8+EP8+MoRI decode - Entry script: minimaxm25_fp8_mi355x_vllm-disagg.sh - amd-master.yaml: e2e test entry for 1P2D on MI355X (1k1k, 8k1k, 1k8k) MiniMax M2.5 (230B, 256 experts, top-8 sigmoid routing, GQA) uses the same disagg infrastructure as DSR1. Unlike DeepSeek MLA models, M2.5 uses standard GQA attention so AITER paged attention is fully supported and no block-size/cudagraph workarounds are needed. Co-authored-by: ChuanLi1101 Co-authored-by: Claude Made-with: Cursor --- .github/configs/amd-master.yaml | 75 ++++++++++++++++++ .../minimaxm25_fp8_mi355x_vllm-disagg.sh | 77 +++++++++++++++++++ .../multi_node/vllm_disagg_utils/models.yaml | 6 ++ 3 files changed, 158 insertions(+) create mode 100644 benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 484e4322c..a261a0ca7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1100,6 +1100,81 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +minimaxm25-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm25 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 1024 + osl: 8192 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 diff --git a/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..137ee0381 --- /dev/null +++ b/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH 
+export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 0ef2bc77f..3e62972b8 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -41,6 +41,12 @@ Kimi-K2.5-MXFP4: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--amd--Kimi-K2.5-MXFP4" +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + gpt-oss-120b: prefill_flags: "--tensor-parallel-size 8" decode_flags: "--tensor-parallel-size 8" From ac24450950c0188e5cedfaec6a55e540d315e3ac Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:09:47 +0000 Subject: [PATCH 23/31] feat: add Dockerfile and runtime patch for MiniMax M2.5 WideEP + MoRI Cherry-picked from ChuanLi1101/InferenceMAX:chuali/minimax-m25-vllm-disagg (commit bb6bd0ed). Adapted for v0.18.0 base: kept vllm/vllm-openai-rocm:v0.18.0 image (runtime patch via setup_deps.sh is sufficient; custom Docker image available in docker/minimax-m25-disagg/ if needed). Two deployment options for getting vLLM minimax_m2.py changes into the container: Option A -- Custom Docker image (docker/minimax-m25-disagg/): Builds from the public vLLM ROCm image and pre-installs UCX, etcd, RIXL, and patched minimax_m2.py with WideEP + MoRI + EPLB support baked in. Option B -- Runtime patch (setup_deps.sh): patch_minimax_m2_wideep_mori() copies patched minimax_m2.py from the mounted InferenceX repo into the container's vLLM installation at startup. 
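For reference, Option B boils down to copying the patched model file over the one shipped in the installed vLLM package. A rough sketch under two assumptions (VLLM_WS_PATH points at the mounted vllm_disagg_utils directory, and vLLM uses its standard model_executor/models layout); the real logic lives in setup_deps.sh's patch_minimax_m2_wideep_mori:

  # Sketch only: resolve the installed vLLM models directory and overwrite
  # the stock minimax_m2.py with the patched copy shipped in this repo.
  VLLM_MODELS_DIR=$(python3 -c "import os, vllm.model_executor.models as m; print(os.path.dirname(m.__file__))")
  cp "${VLLM_WS_PATH}/patches/minimax_m2.py" "${VLLM_MODELS_DIR}/minimax_m2.py"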
Co-authored-by: ChuanLi1101 Co-authored-by: Claude Made-with: Cursor --- .../vllm_disagg_utils/patches/minimax_m2.py | 672 ++++++++++++++++++ .../vllm_disagg_utils/setup_deps.sh | 40 ++ docker/minimax-m25-disagg/Dockerfile | 91 +++ docker/minimax-m25-disagg/build.sh | 31 + .../minimax-m25-disagg/patches/minimax_m2.py | 672 ++++++++++++++++++ 5 files changed, 1506 insertions(+) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py create mode 100644 docker/minimax-m25-disagg/Dockerfile create mode 100644 docker/minimax-m25-disagg/build.sh create mode 100644 docker/minimax-m25-disagg/patches/minimax_m2.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py new file mode 100644 index 000000000..c27b77ccf --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. 
+ """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + params_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + 
final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and 
isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = 
intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def 
embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 848bd6918..7f691d141 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -832,6 +832,45 @@ except Exception as e: _SETUP_INSTALLED+=("idle-kv-reaper") } +# --------------------------------------------------------------------------- +# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support +# Replaces the upstream minimax_m2.py with our patched version that adds +# GateLinear, EP group integration, sequence parallelism, and the +# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. +# --------------------------------------------------------------------------- +patch_minimax_m2_wideep_mori() { + local patch_file="${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}/patches/minimax_m2.py" + if [[ ! -f "$patch_file" ]]; then + # Also check the Docker-baked location + patch_file="/opt/vllm_disagg/patches/minimax_m2.py" + fi + if [[ ! 
-f "$patch_file" ]]; then + echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" + return 0 + fi + + python3 -c ' +import os, sys, shutil + +try: + import vllm.model_executor.models.minimax_m2 as mmod + target = mmod.__file__ + src = sys.argv[1] + + with open(target) as f: + if "get_ep_group" in f.read(): + print("[SETUP] minimax_m2.py already has WideEP+MoRI support") + sys.exit(0) + + shutil.copy2(src, target) + print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") + +except Exception as e: + print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) +' "$patch_file" + _SETUP_INSTALLED+=("minimax-m2-wideep-mori") +} + # ============================================================================= # Run installers # ============================================================================= @@ -849,6 +888,7 @@ patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper +patch_minimax_m2_wideep_mori # ============================================================================= # Export paths (persists for server.sh since this file is sourced) diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile new file mode 100644 index 000000000..3bced3f91 --- /dev/null +++ b/docker/minimax-m25-disagg/Dockerfile @@ -0,0 +1,91 @@ +# MiniMax M2.5 PD Disaggregation Docker Image +# +# Extends the public vLLM ROCm image with: +# 1. WideEP + MoRI support for MiniMax M2.5 (minimax_m2.py patch) +# 2. Pre-installed runtime deps (UCX, RIXL, etcd, MoRI) +# 3. Disagg orchestration scripts baked in +# +# Build: +# docker build -t minimax-m25-disagg:latest -f docker/minimax-m25-disagg/Dockerfile . +# +# The image still sources setup_deps.sh at startup for idempotent patching +# (scheduler KV reaper, MoRI-IO read mode, etc.) but the heavy build steps +# (UCX, RIXL) are cached in the image layer. + +ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0 +FROM ${BASE_IMAGE} + +ARG ROCM_PATH=/opt/rocm +ARG UCX_HOME=/usr/local/ucx +ARG RIXL_HOME=/usr/local/rixl + +# ---------------------------------------------------------------- +# 1. Patch vLLM: MiniMax M2.5 WideEP + MoRI + EPLB support +# ---------------------------------------------------------------- +COPY docker/minimax-m25-disagg/patches/minimax_m2.py /tmp/patches/minimax_m2.py +RUN VLLM_MODELS=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \ + cp /tmp/patches/minimax_m2.py "${VLLM_MODELS}/minimax_m2.py" && \ + echo "[DOCKER] Patched minimax_m2.py -> ${VLLM_MODELS}/minimax_m2.py" && \ + rm -rf /tmp/patches + +# ---------------------------------------------------------------- +# 2. Pre-install UCX build deps (speeds up setup_deps.sh at runtime) +# ---------------------------------------------------------------- +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + +# ---------------------------------------------------------------- +# 3. 
Pre-build UCX (ROCm fork) — the longest step in setup_deps.sh +# ---------------------------------------------------------------- +RUN git clone --quiet https://github.com/ROCm/ucx.git /usr/local/src/ucx && \ + cd /usr/local/src/ucx && \ + git checkout da3fac2a && \ + ./autogen.sh && mkdir -p build && cd build && \ + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm && \ + make -j"$(nproc)" && make install && \ + rm -rf /usr/local/src/ucx + +# ---------------------------------------------------------------- +# 4. Pre-install etcd +# ---------------------------------------------------------------- +RUN ARCH=$(uname -m) && \ + if [ "$ARCH" = "x86_64" ]; then ETCD_ARCH=amd64; else ETCD_ARCH=arm64; fi && \ + ETCD_VER=v3.5.21 && \ + curl -fsSL "https://github.com/etcd-io/etcd/releases/download/${ETCD_VER}/etcd-${ETCD_VER}-linux-${ETCD_ARCH}.tar.gz" | \ + tar xz -C /usr/local/bin --strip-components=1 \ + "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcd" \ + "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcdctl" && \ + etcd --version + +# ---------------------------------------------------------------- +# 5. Pre-install RIXL (Nixl KV transfer) +# ---------------------------------------------------------------- +RUN pip install --no-cache-dir nixl && \ + python3 -c "import nixl; print('RIXL installed:', nixl.__file__)" || \ + echo "[DOCKER] WARN: nixl pip install failed, will fallback to setup_deps.sh" + +# ---------------------------------------------------------------- +# 6. Copy disagg orchestration scripts into the image +# ---------------------------------------------------------------- +COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ +COPY benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ + +# ---------------------------------------------------------------- +# 7. Environment +# ---------------------------------------------------------------- +ENV UCX_HOME=${UCX_HOME} \ + RIXL_HOME=${RIXL_HOME} \ + ROCM_PATH=${ROCM_PATH} \ + PATH="${UCX_HOME}/bin:/usr/local/bin:${PATH}" \ + LD_LIBRARY_PATH="${UCX_HOME}/lib:${LD_LIBRARY_PATH:-}" \ + PYTHONPYCACHEPREFIX=/tmp/pycache + +WORKDIR /workspace diff --git a/docker/minimax-m25-disagg/build.sh b/docker/minimax-m25-disagg/build.sh new file mode 100644 index 000000000..b36227caf --- /dev/null +++ b/docker/minimax-m25-disagg/build.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Build the MiniMax M2.5 PD Disagg Docker image. +# +# Usage: +# cd +# bash docker/minimax-m25-disagg/build.sh [tag] [base_image] +# +# Examples: +# bash docker/minimax-m25-disagg/build.sh # default tag + base +# bash docker/minimax-m25-disagg/build.sh my-tag:v1 # custom tag +# bash docker/minimax-m25-disagg/build.sh latest vllm/vllm-openai-rocm:v0.19.0 # custom base +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +TAG="${1:-minimax-m25-disagg:latest}" +BASE_IMAGE="${2:-vllm/vllm-openai-rocm:v0.18.0}" + +echo "Building MiniMax M2.5 Disagg image..." +echo " Tag: $TAG" +echo " Base image: $BASE_IMAGE" +echo " Context: $REPO_ROOT" + +docker build \ + -t "$TAG" \ + --build-arg BASE_IMAGE="$BASE_IMAGE" \ + -f "$REPO_ROOT/docker/minimax-m25-disagg/Dockerfile" \ + "$REPO_ROOT" + +echo "" +echo "Done. 
Image: $TAG" +echo "To push: docker tag $TAG /$TAG && docker push /$TAG" diff --git a/docker/minimax-m25-disagg/patches/minimax_m2.py b/docker/minimax-m25-disagg/patches/minimax_m2.py new file mode 100644 index 000000000..c27b77ccf --- /dev/null +++ b/docker/minimax-m25-disagg/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. 
+ """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + params_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + 
final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and 
isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = 
intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def 
embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None From e553a8c9dd0e5452703497b962d05caa575a3bad Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 15:21:45 +0000 Subject: [PATCH 24/31] Fix: rename minimaxm25 to minimaxm2.5 for CI naming consistency Align MiniMax M2.5 disagg naming with existing single-node configs (minimaxm2.5_fp8_mi355x.sh, minimaxm2.5_fp8_mi300x.sh, etc.). - amd-master.yaml: minimaxm25 -> minimaxm2.5 in config key + model-prefix - Rename entry script: minimaxm25_fp8_mi355x_vllm-disagg.sh -> minimaxm2.5_fp8_mi355x_vllm-disagg.sh - Dockerfile: update COPY path to match renamed script --- .github/configs/amd-master.yaml | 6 ++---- ...vllm-disagg.sh => minimaxm2.5_fp8_mi355x_vllm-disagg.sh} | 0 docker/minimax-m25-disagg/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) rename benchmarks/multi_node/{minimaxm25_fp8_mi355x_vllm-disagg.sh => minimaxm2.5_fp8_mi355x_vllm-disagg.sh} (100%) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a261a0ca7..6956d162f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -838,7 +838,6 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - dsr1-fp8-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 @@ -993,7 +992,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - dsr1-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:v0.18.0 model: deepseek-ai/DeepSeek-R1-0528 @@ -1100,10 +1098,10 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" -minimaxm25-fp8-mi355x-vllm-disagg: +minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm25 + model-prefix: minimaxm2.5 runner: mi355x-disagg precision: fp8 framework: vllm-disagg diff --git a/benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh similarity index 100% rename from benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh rename to benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile index 3bced3f91..88e9ce764 100644 --- 
a/docker/minimax-m25-disagg/Dockerfile +++ b/docker/minimax-m25-disagg/Dockerfile @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir nixl && \ # 6. Copy disagg orchestration scripts into the image # ---------------------------------------------------------------- COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ -COPY benchmarks/multi_node/minimaxm25_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ +COPY benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ # ---------------------------------------------------------------- # 7. Environment From ebaabd2b1c5135eb94816c75824d3a3780a9fd72 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 16:07:42 +0000 Subject: [PATCH 25/31] Optimize: add --gpu-memory-utilization 0.95 and --block-size 32 to MiniMax M2.5 disagg Align MiniMax M2.5 disagg serve parameters with the proven single-node config (minimaxm2.5_fp8_mi355x.sh). MiniMax M2.5 uses GQA (not MLA), so block-size 32 is optimal (vs block-size 1 for DeepSeek/Kimi MLA). The extra 5% GPU memory (0.95 vs default 0.9) increases KV cache capacity for high-concurrency sweeps (C256/C512). --- benchmarks/multi_node/vllm_disagg_utils/models.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 3e62972b8..0b4629b13 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -42,8 +42,8 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching" + prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" From 4d82c0f6220046508e275c5f52ebd3becc27ca49 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 18:17:08 +0000 Subject: [PATCH 26/31] Fix: MiniMax M2.5 disagg - require EP=8 for prefill, fix ROCm gate dtype MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MiniMax M2.5 has expert intermediate_size=1536; with TP=8 and no EP the sharded dimension (192) is not divisible by FP8 block_n=128, crashing the prefill node. Set prefill EP=8 (matching decode and single-node) and add --enable-expert-parallel --all2all-backend mori to prefill_flags. Fix GateLinear to use out_dtype=torch.float32 instead of params_dtype=torch.float32 so the GEMM runs in bf16 (ROCm compatible) and only the output is cast to fp32 for routing precision. Remove the 1K/8K benchmark scenario (not needed).
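A quick arithmetic sketch of the divisibility argument above (illustrative only, not part of the patch; the constants are the ones quoted in this commit message):

    # Illustrative sketch: why TP-only sharding breaks FP8 block quantization
    # for MiniMax M2.5 experts, while EP=8 does not.
    intermediate_size = 1536   # expert FFN width, per the commit message
    tp_size = 8
    fp8_block_n = 128          # FP8 weight-quant block size along the sharded dim

    # TP-only: each rank holds a 1536/8-wide slice of every expert.
    tp_shard = intermediate_size // tp_size
    print(tp_shard, tp_shard % fp8_block_n)   # 192 64 -> shard not block-aligned, load fails

    # EP=8: experts are placed whole on ranks instead of being tensor-sharded,
    # so the quantized dimension keeps its full width.
    print(intermediate_size % fp8_block_n)    # 0 -> block-aligned, loads fine

The same reasoning is why the decode workers already run with --enable-expert-parallel; this change simply brings prefill in line with them.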
--- .github/configs/amd-master.yaml | 26 +++---------------- .../multi_node/vllm_disagg_utils/models.yaml | 2 +- .../vllm_disagg_utils/patches/minimax_m2.py | 2 +- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6956d162f..111b505bf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1112,12 +1112,14 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: osl: 1024 search-space: # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1138,30 +1140,9 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: prefill: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=2" - - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: @@ -1172,7 +1153,6 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 0b4629b13..c6d27b5ae 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -42,7 +42,7 @@ Kimi-K2.5-MXFP4: hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: - prefill_flags: "--tensor-parallel-size 8 --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py index c27b77ccf..8290276fb 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py @@ -128,7 +128,7 @@ def __init__( self.gate = GateLinear( config.hidden_size, config.num_local_experts, - params_dtype=torch.float32, + out_dtype=torch.float32, prefix=f"{prefix}.gate", ) From e1633126079496f7f2bd0f46a2782cffcfd0a720 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 3 Apr 2026 19:28:13 +0000 Subject: [PATCH 27/31] Remove unused docker/minimax-m25-disagg/ directory The Dockerfile, build.sh, and duplicate minimax_m2.py patch were never used by the CI pipeline or local tests. 
--- docker/minimax-m25-disagg/Dockerfile | 91 --- docker/minimax-m25-disagg/build.sh | 31 - .../minimax-m25-disagg/patches/minimax_m2.py | 672 ------------------ 3 files changed, 794 deletions(-) delete mode 100644 docker/minimax-m25-disagg/Dockerfile delete mode 100644 docker/minimax-m25-disagg/build.sh delete mode 100644 docker/minimax-m25-disagg/patches/minimax_m2.py diff --git a/docker/minimax-m25-disagg/Dockerfile b/docker/minimax-m25-disagg/Dockerfile deleted file mode 100644 index 88e9ce764..000000000 --- a/docker/minimax-m25-disagg/Dockerfile +++ /dev/null @@ -1,91 +0,0 @@ -# MiniMax M2.5 PD Disaggregation Docker Image -# -# Extends the public vLLM ROCm image with: -# 1. WideEP + MoRI support for MiniMax M2.5 (minimax_m2.py patch) -# 2. Pre-installed runtime deps (UCX, RIXL, etcd, MoRI) -# 3. Disagg orchestration scripts baked in -# -# Build: -# docker build -t minimax-m25-disagg:latest -f docker/minimax-m25-disagg/Dockerfile . -# -# The image still sources setup_deps.sh at startup for idempotent patching -# (scheduler KV reaper, MoRI-IO read mode, etc.) but the heavy build steps -# (UCX, RIXL) are cached in the image layer. - -ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0 -FROM ${BASE_IMAGE} - -ARG ROCM_PATH=/opt/rocm -ARG UCX_HOME=/usr/local/ucx -ARG RIXL_HOME=/usr/local/rixl - -# ---------------------------------------------------------------- -# 1. Patch vLLM: MiniMax M2.5 WideEP + MoRI + EPLB support -# ---------------------------------------------------------------- -COPY docker/minimax-m25-disagg/patches/minimax_m2.py /tmp/patches/minimax_m2.py -RUN VLLM_MODELS=$(python3 -c "import vllm.model_executor.models; import os; print(os.path.dirname(vllm.model_executor.models.__file__))") && \ - cp /tmp/patches/minimax_m2.py "${VLLM_MODELS}/minimax_m2.py" && \ - echo "[DOCKER] Patched minimax_m2.py -> ${VLLM_MODELS}/minimax_m2.py" && \ - rm -rf /tmp/patches - -# ---------------------------------------------------------------- -# 2. Pre-install UCX build deps (speeds up setup_deps.sh at runtime) -# ---------------------------------------------------------------- -RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ - autoconf automake libtool pkg-config \ - librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ - infiniband-diags perftest ethtool rdma-core strace \ - && rm -rf /var/lib/apt/lists/* - -# ---------------------------------------------------------------- -# 3. Pre-build UCX (ROCm fork) — the longest step in setup_deps.sh -# ---------------------------------------------------------------- -RUN git clone --quiet https://github.com/ROCm/ucx.git /usr/local/src/ucx && \ - cd /usr/local/src/ucx && \ - git checkout da3fac2a && \ - ./autogen.sh && mkdir -p build && cd build && \ - ../configure \ - --prefix="${UCX_HOME}" \ - --enable-shared --disable-static \ - --disable-doxygen-doc --enable-optimizations \ - --enable-devel-headers --enable-mt \ - --with-rocm="${ROCM_PATH}" --with-verbs --with-dm && \ - make -j"$(nproc)" && make install && \ - rm -rf /usr/local/src/ucx - -# ---------------------------------------------------------------- -# 4. 
Pre-install etcd -# ---------------------------------------------------------------- -RUN ARCH=$(uname -m) && \ - if [ "$ARCH" = "x86_64" ]; then ETCD_ARCH=amd64; else ETCD_ARCH=arm64; fi && \ - ETCD_VER=v3.5.21 && \ - curl -fsSL "https://github.com/etcd-io/etcd/releases/download/${ETCD_VER}/etcd-${ETCD_VER}-linux-${ETCD_ARCH}.tar.gz" | \ - tar xz -C /usr/local/bin --strip-components=1 \ - "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcd" \ - "etcd-${ETCD_VER}-linux-${ETCD_ARCH}/etcdctl" && \ - etcd --version - -# ---------------------------------------------------------------- -# 5. Pre-install RIXL (Nixl KV transfer) -# ---------------------------------------------------------------- -RUN pip install --no-cache-dir nixl && \ - python3 -c "import nixl; print('RIXL installed:', nixl.__file__)" || \ - echo "[DOCKER] WARN: nixl pip install failed, will fallback to setup_deps.sh" - -# ---------------------------------------------------------------- -# 6. Copy disagg orchestration scripts into the image -# ---------------------------------------------------------------- -COPY benchmarks/multi_node/vllm_disagg_utils/ /opt/vllm_disagg/ -COPY benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh /opt/vllm_disagg/ - -# ---------------------------------------------------------------- -# 7. Environment -# ---------------------------------------------------------------- -ENV UCX_HOME=${UCX_HOME} \ - RIXL_HOME=${RIXL_HOME} \ - ROCM_PATH=${ROCM_PATH} \ - PATH="${UCX_HOME}/bin:/usr/local/bin:${PATH}" \ - LD_LIBRARY_PATH="${UCX_HOME}/lib:${LD_LIBRARY_PATH:-}" \ - PYTHONPYCACHEPREFIX=/tmp/pycache - -WORKDIR /workspace diff --git a/docker/minimax-m25-disagg/build.sh b/docker/minimax-m25-disagg/build.sh deleted file mode 100644 index b36227caf..000000000 --- a/docker/minimax-m25-disagg/build.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# Build the MiniMax M2.5 PD Disagg Docker image. -# -# Usage: -# cd -# bash docker/minimax-m25-disagg/build.sh [tag] [base_image] -# -# Examples: -# bash docker/minimax-m25-disagg/build.sh # default tag + base -# bash docker/minimax-m25-disagg/build.sh my-tag:v1 # custom tag -# bash docker/minimax-m25-disagg/build.sh latest vllm/vllm-openai-rocm:v0.19.0 # custom base -set -euo pipefail - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -TAG="${1:-minimax-m25-disagg:latest}" -BASE_IMAGE="${2:-vllm/vllm-openai-rocm:v0.18.0}" - -echo "Building MiniMax M2.5 Disagg image..." -echo " Tag: $TAG" -echo " Base image: $BASE_IMAGE" -echo " Context: $REPO_ROOT" - -docker build \ - -t "$TAG" \ - --build-arg BASE_IMAGE="$BASE_IMAGE" \ - -f "$REPO_ROOT/docker/minimax-m25-disagg/Dockerfile" \ - "$REPO_ROOT" - -echo "" -echo "Done. Image: $TAG" -echo "To push: docker tag $TAG /$TAG && docker push /$TAG" diff --git a/docker/minimax-m25-disagg/patches/minimax_m2.py b/docker/minimax-m25-disagg/patches/minimax_m2.py deleted file mode 100644 index c27b77ccf..000000000 --- a/docker/minimax-m25-disagg/patches/minimax_m2.py +++ /dev/null @@ -1,672 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2025 The MiniMax AI team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. 
It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only MiniMaxM2/M2.5 model.""" - -from collections.abc import Iterable -from typing import Any - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm._aiter_ops import rocm_aiter_ops -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import ( - get_ep_group, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, - RowParallelLinear, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import sequence_parallel_chunk -from vllm.sequence import IntermediateTensors - -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class MiniMaxM2MoE(nn.Module): - """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. - - Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with - expert parallelism, EPLB, and sequence parallel awareness. 
- """ - - def __init__( - self, - config: PretrainedConfig, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ): - super().__init__() - vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_group - self.ep_size = self.ep_group.size() - - self.n_routed_experts: int = config.num_local_experts - self.n_shared_experts: int = 0 - - self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) - self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - - eplb_config = parallel_config.eplb_config - self.enable_eplb = parallel_config.enable_eplb - self.n_redundant_experts = eplb_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts - self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.use_routing_bias = getattr(config, "use_routing_bias", False) - if self.use_routing_bias: - self.e_score_correction_bias = nn.Parameter( - torch.empty(config.num_local_experts, dtype=torch.float32) - ) - self.e_score_correction_bias.weight_loader = ( - MiniMaxM2MoE.ebias_weight_loader - ) - else: - self.e_score_correction_bias = None - - self.gate = GateLinear( - config.hidden_size, - config.num_local_experts, - params_dtype=torch.float32, - prefix=f"{prefix}.gate", - ) - - self.experts = FusedMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - reduce_results=False, - renormalize=True, - scoring_func=getattr(config, "scoring_func", "softmax"), - e_score_correction_bias=self.e_score_correction_bias, - quant_config=quant_config, - prefix=f"{prefix}.experts", - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts, - is_sequence_parallel=self.is_sequence_parallel, - router_logits_dtype=torch.float32, - gate=self.gate, - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, - ) - - @staticmethod - def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight.to(torch.float32)) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.is_sequence_parallel: - hidden_states = sequence_parallel_chunk(hidden_states) - - if self.experts.is_internal_router: - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=hidden_states - ) - else: - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits - ) - - if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states = final_hidden_states * self.routed_scaling_factor - - if self.is_sequence_parallel: - final_hidden_states = tensor_model_parallel_all_gather( - final_hidden_states, 0 - ) - final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - 
final_hidden_states - ) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -class MiniMaxM2Attention(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rotary_dim: int, - rope_parameters: dict[str, Any] | None = None, - attn_window_size: int | None = None, - max_position_embeddings: int = 8192, - head_dim: int | None = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - if ( - rope_parameters is not None - and "partial_rotary_factor" not in rope_parameters - ): - rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim - self.rotary_emb = get_rope( - self.head_dim, - max_position=max_position_embeddings, - rope_parameters=rope_parameters, - ) - self.attn = Attention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - self.q_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_heads, eps=rms_norm_eps - ) - self.k_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = MiniMaxText01RMSNormTP.forward_qk( - self.q_norm, self.k_norm, q.contiguous(), k.contiguous() - ) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MiniMaxM2DecoderLayer(nn.Module): - def __init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if hasattr(config, "max_model_len") and 
isinstance(config.max_model_len, int): - max_position_embeddings = max( - config.max_position_embeddings, config.max_model_len - ) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. - layer_idx = int(prefix.split(sep=".")[-1]) - - self.layer_idx = layer_idx - self.self_attn = MiniMaxM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rotary_dim=config.rotary_dim, - rope_parameters=config.rope_parameters, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - self.block_sparse_moe = MiniMaxM2MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor | None, - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - - hidden_states = self.block_sparse_moe(hidden_states) - - return hidden_states, residual - - -@support_torch_compile -class MiniMaxM2Model(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=None, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MiniMaxM2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers", - ) - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None, - inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.embed_input_ids(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = 
intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for layer in self.layers[self.start_layer : self.end_layer]: - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - num_redundant_experts=0, - ) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = self.get_expert_mapping() - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - - for param_name, weight_name, shard_id in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if ("mlp.experts." in name) and name not in params_dict: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. 
- name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MiniMaxM2MixtureOfExperts(MixtureOfExperts): - """EPLB protocol implementation for MiniMax M2/M2.5.""" - - moe_mlp_layers: list[MiniMaxM2MoE] - - def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("MiniMax M2: No MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for moe in self.moe_mlp_layers: - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - -class MiniMaxM2ForCausalLM( - nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - if hasattr(vllm_config.model_config, "max_model_len"): - self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxM2Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None - ) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - self.num_moe_layers = config.num_hidden_layers - self._set_moe_parameters() - - def _set_moe_parameters(self): - self.expert_weights: list = [] - self.num_expert_groups = 1 - self.moe_layers: list = [] - self.moe_mlp_layers: list[MiniMaxM2MoE] = [] - example_moe = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, MiniMaxM2DecoderLayer) - if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): - example_moe = layer.block_sparse_moe - self.moe_mlp_layers.append(layer.block_sparse_moe) - self.moe_layers.append(layer.block_sparse_moe.experts) - self.extract_moe_parameters(example_moe) - - def 
embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs, - ) -> torch.Tensor | IntermediateTensors: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states) - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name( - config: PretrainedConfig, weight_name: str -) -> int | None: - if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_mtp_modules): - if weight_name.startswith(f"model.layers.{layer_idx + i}."): - return layer_idx + i - return None From 185df53ba84fc07860bf2c636319a8c4d22f94cc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 13 Apr 2026 03:00:45 +0000 Subject: [PATCH 28/31] remove vllm disagg for dpsr1 and dpv3 Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 53 ------------- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 79 ------------------- .../multi_node/vllm_disagg_utils/models.yaml | 13 +-- 3 files changed, 1 insertion(+), 144 deletions(-) delete mode 100755 benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 111b505bf..32de6f552 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -992,59 +992,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" -dsr1-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp8 - framework: vllm-disagg - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - kimik2.5-fp4-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4 diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh deleted file mode 100755 index b21e9204a..000000000 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname 
"$0")/../benchmark_lib.sh" - -check_env_vars \ - CONC_LIST \ - ISL \ - OSL \ - IMAGE \ - SPEC_DECODING \ - MODEL_PATH \ - PREFILL_NUM_WORKERS \ - PREFILL_TP \ - PREFILL_EP \ - PREFILL_DP_ATTN \ - DECODE_NUM_WORKERS \ - DECODE_TP \ - DECODE_EP \ - DECODE_DP_ATTN \ - PREFILL_NODES \ - DECODE_NODES \ - RANDOM_RANGE_RATIO - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -set -x - -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 - -export TIME_LIMIT="08:00:00" -export MODEL_PATH=$MODEL_PATH -export MODEL_NAME=$MODEL_NAME -export CONTAINER_IMAGE=$IMAGE - -# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh -if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then - export PREFILL_ENABLE_EP=false -else - export PREFILL_ENABLE_EP=true -fi - -if [[ "$PREFILL_DP_ATTN" == "true" ]]; then - export PREFILL_ENABLE_DP=true -else - export PREFILL_ENABLE_DP=false -fi - -if [[ "${DECODE_EP:-1}" -eq 1 ]]; then - export DECODE_ENABLE_EP=false -else - export DECODE_ENABLE_EP=true -fi - -if [[ "$DECODE_DP_ATTN" == "true" ]]; then - export DECODE_ENABLE_DP=true -else - export DECODE_ENABLE_DP=false -fi - -# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. -JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ - $PREFILL_NUM_WORKERS \ - $DECODE_NODES \ - $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf \ - ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ - ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ - ${PREFILL_TP} ${DECODE_TP} \ - ${RANDOM_RANGE_RATIO} \ - "${NODELIST:-}") - -if [[ $? -ne 0 ]]; then - echo "Failed to submit job" >&2 - exit 1 -fi - -echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index c6d27b5ae..c68bb46e3 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -12,7 +12,7 @@ # decode_flags: str # vLLM CLI flags for decode workers # env: str # Space-separated KEY=VALUE pairs exported before vllm serve # hf_dir: str # (optional) On-disk directory name if it differs from the key -# # e.g. HF cache layout: models--deepseek-ai--DeepSeek-R1-0528 +# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 Llama-3.1-405B-Instruct-FP8-KV: prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" @@ -24,17 +24,6 @@ amd-Llama-3.3-70B-Instruct-FP8-KV: decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" -DeepSeek-V3: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - -DeepSeek-R1-0528: - prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" - hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" - Kimi-K2.5-MXFP4: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" From 48cc23aba3248a2ab96711e7922edc2773c700de Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 06:40:27 +0000 Subject: [PATCH 29/31] consolidate amd_utils for sglang and vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 72 +- benchmarks/multi_node/amd_utils/env.sh | 236 ++++--- benchmarks/multi_node/amd_utils/job.slurm | 467 ++++++------- .../models_vllm.yaml} | 0 .../moriio_proxy.py | 0 .../patches/minimax_m2.py | 0 benchmarks/multi_node/amd_utils/server.sh | 66 +- .../multi_node/amd_utils/server_sglang.sh | 624 ++++++++++++++++++ .../server.sh => amd_utils/server_vllm.sh} | 44 +- .../setup_deps.sh | 2 +- .../start_etcd.sh | 0 benchmarks/multi_node/amd_utils/submit.sh | 112 ++-- benchmarks/multi_node/amd_utils/sync.py | 5 +- .../dsr1_fp4_mi355x_sglang-disagg.sh | 3 +- .../dsr1_fp8_mi355x_sglang-disagg.sh | 3 +- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 5 +- .../minimaxm2.5_fp8_mi355x_vllm-disagg.sh | 5 +- .../multi_node/vllm_disagg_utils/bench.sh | 76 --- .../multi_node/vllm_disagg_utils/env.sh | 98 --- .../multi_node/vllm_disagg_utils/job.slurm | 358 ---------- .../multi_node/vllm_disagg_utils/submit.sh | 166 ----- .../multi_node/vllm_disagg_utils/sync.py | 201 ------ 22 files changed, 1195 insertions(+), 1348 deletions(-) rename benchmarks/multi_node/{vllm_disagg_utils/models.yaml => amd_utils/models_vllm.yaml} (100%) rename benchmarks/multi_node/{vllm_disagg_utils => amd_utils}/moriio_proxy.py (100%) rename 
benchmarks/multi_node/{vllm_disagg_utils => amd_utils}/patches/minimax_m2.py (100%) create mode 100755 benchmarks/multi_node/amd_utils/server_sglang.sh rename benchmarks/multi_node/{vllm_disagg_utils/server.sh => amd_utils/server_vllm.sh} (95%) rename benchmarks/multi_node/{vllm_disagg_utils => amd_utils}/setup_deps.sh (99%) rename benchmarks/multi_node/{vllm_disagg_utils => amd_utils}/start_etcd.sh (100%) delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/bench.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/env.sh delete mode 100644 benchmarks/multi_node/vllm_disagg_utils/job.slurm delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/submit.sh delete mode 100755 benchmarks/multi_node/vllm_disagg_utils/sync.py diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index ac996c5a9..87f3b1e8a 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# +# ENGINE=sglang (default): SGLang benchmark +# ENGINE=vllm: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline +# can collect and process results. +# +# Usage: bash bench.sh \ +# \ +# + +ENGINE="${ENGINE:-sglang}" n_prefill=$1 n_decode=$2 @@ -6,58 +19,81 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -MODEL_PATH="${model_path}/${model_name}" +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-1} +if [[ "$ENGINE" == "vllm" ]]; then + chosen_req_rate=${11:-inf} +else + chosen_req_rate=${11:-1} +fi random_range_ratio=${12:-0.8} num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -head_node="localhost" -head_port="30000" +ROUTER_PORT="${ROUTER_PORT:-30000}" +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" -profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p $profile_folder +profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" source "$(dirname "$0")/../../benchmark_lib.sh" -# Repo root inside the container (3 levels up from this script's directory) REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" -for max_concurrency in ${chosen_concurrencies[@]}; do +for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + echo "profile_folder: $profile_folder" echo "max_concurrency: $max_concurrency" echo "chosen_req_rate: $chosen_req_rate" echo "MODEL_PATH: $MODEL_PATH" - echo "head_port: $head_port" + echo "ROUTER_PORT: $ROUTER_PORT" echo "chosen_isl: $chosen_isl" echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" echo "export_file: $export_file" + # Engine-specific extra flags + extra_flags="" + if [[ "$ENGINE" == "vllm" ]]; then + extra_flags="--trust-remote-code" + else + if [ "$IS_MTP" = "true" ]; then + extra_flags="--use-chat-template" + fi + fi + run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model ${MODEL_PATH} \ - --port ${head_port} \ + --model "$MODEL_PATH" \ + --port "$ROUTER_PORT" \ --backend openai \ - --input-len ${chosen_isl} \ - --output-len ${chosen_osl} \ - --random-range-ratio ${random_range_ratio} \ - --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ --result-dir /workspace/ \ - $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" ) + $extra_flags echo "-----------------------------------------" + + # vLLM: cooldown between rounds for idle KV block reaper + if [[ "$ENGINE" == "vllm" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." + sleep 10 + fi done diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..c5a438541 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -1,99 +1,184 @@ #!/bin/bash -# SGLang/MoRI environment setup for multi-node disaggregated serving. +# Dual-engine environment setup for multi-node disaggregated serving. +# +# ENGINE=sglang (default): SGLang/MoRI environment +# ENGINE=vllm: vLLM/Nixl environment # # REQUIRED ENVIRONMENT VARIABLES: # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# This must be set by the runner script (runners/launch_mi355x-amds.sh) -# -# OPTIONAL ENVIRONMENT VARIABLES: -# MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS. - +# Set by runner or auto-detected from hostname. 
set -x + +ENGINE="${ENGINE:-sglang}" export PYTHONDONTWRITEBYTECODE=1 -# IBDEVICES configuration +# ============================================================================= +# Shared: IBDEVICES detection +# ============================================================================= + # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) # Fall back to hostname detection if not set (for direct script execution) if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" else - echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 - exit 1 + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi export IBDEVICES -# Auto-detect default network interface (portable across clusters) +# Shared: Auto-detect default network interface (portable across clusters) export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) set +x -export NCCL_IB_HCA=$IBDEVICES +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} -export SGLANG_USE_AITER=1 -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 +# ============================================================================= +# Engine-specific environment +# ============================================================================= -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION -export SGLANG_MORI_FP8_DISP=True +if [[ "$ENGINE" == "vllm" ]]; then + # ========================================================================= + # vLLM/Nixl-specific environment + # ========================================================================= + set -x -if [[ "$MODEL_NAME" == *mxfp4* ]]; then -export SGLANG_MORI_FP8_DISP=False -fi + # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + if [[ -z "$UCX_NET_DEVICES" ]]; then + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" + else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" + fi -export SGLANG_MORI_FP4_DISP=False -export SGLANG_MORI_FP8_COMB=False + # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) -export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 -fi -export MORI_MAX_DISPATCH_TOKENS_DECODE=160 - -# set MTP size=1 when EP16 -export 
SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 - -export MORI_APP_LOG_LEVEL=INFO - -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" - -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
+ fi + fi + + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" + +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= + + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 + + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True + + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False + + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 + + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_APP_LOG_LEVEL=INFO + + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 @@ -102,25 +187,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p { export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. 
Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." - echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 2f88250b5..172d9e73b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,260 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. -MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" if ! 
grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. +if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + +# ============================================================================= +# Model Path Resolution +# ============================================================================= # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then MODEL_DIR="/nfsdata" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then MODEL_DIR="/it-share/data" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" else - MODEL_DIR="/nfsdata" # Default fallback - echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)" + MODEL_DIR="/nfsdata" fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" fi export MODEL_DIR -# ------------------------ -# Model path validation and selection across all nodes -# ------------------------ -echo "Looking for model: $MODEL_NAME" -echo "Checking model availability across all allocated nodes..." 
- -# Get all allocated nodes -ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) - -echo "Total allocated nodes: $TOTAL_NODES" -echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" - -# Function to check model path on all nodes -check_model_path() { - local path=$1 - local check_name=$2 - - echo "Checking $check_name: $path" +if [[ "$ENGINE" == "vllm" ]]; then + # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots + DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" + echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + + resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 + } + + MODEL_PATH="" + SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" + ) + + for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi + done - # Run check on all nodes in parallel - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " - if [ -d '$path' ]; then - echo \"\$(hostname): ✓ Found $path\" - exit 0 + if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + echo "Final MODEL_PATH: $MODEL_PATH" +else + # SGLang: Validate model path across all allocated nodes + echo "Looking for model: $MODEL_NAME" + echo "Checking model availability across all allocated nodes..." + + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? 
- if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." - exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +# Honor NUM_NODES exported by submit.sh (PREFILL_NODES + DECODE_NODES); fall back to worker counts if unset. +NUM_NODES="${NUM_NODES:-$((xP + yD))}" +echo "NUM_NODES: $NUM_NODES (xP=$xP, yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS"
-echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." 
>&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +264,16 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -297,38 +287,101 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" - +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the selected nodes for srun execution SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." 
srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -340,10 +393,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true -exec sudo docker run --rm \ +exec \$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -366,50 +419,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 &&
echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log @@ -422,4 +443,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml similarity index 100% rename from benchmarks/multi_node/vllm_disagg_utils/models.yaml rename to benchmarks/multi_node/amd_utils/models_vllm.yaml diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py similarity index 100% rename from benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py rename to benchmarks/multi_node/amd_utils/moriio_proxy.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py similarity index 100% rename from benchmarks/multi_node/vllm_disagg_utils/patches/minimax_m2.py rename to benchmarks/multi_node/amd_utils/patches/minimax_m2.py diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9ed395bb4..0d5685a4d 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,63 +1,23 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration +# 
Dispatches to the engine-specific server launcher based on ENGINE env var. +# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# ============================================================================= -source $SGLANG_WS_PATH/env.sh +ENGINE="${ENGINE:-sglang}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +if [[ "$ENGINE" == "vllm" ]]; then + source "$WS_PATH/server_vllm.sh" else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." 
+ source "$WS_PATH/server_sglang.sh" fi +<<<<<<< HEAD # ============================================================================= # Model-Specific Configuration from YAML @@ -703,3 +663,5 @@ fi echo "Script completed successfully" exit 0 +======= +>>>>>>> 766ba4ee (consolidate amd_utils for sglang and vllm) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. 
+eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Ceiling division by GPUS_PER_NODE for nodes-per-worker +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do 
+ prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + 
--local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO =======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model Name : ${MODEL_NAME:-'Not specified'}" + echo "================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" + echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "================================================" + + # start the head prefill server + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/$MODEL_NAME \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" + fi + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + + echo "Waiting for all prefill and decode servers to be up . . ." + + + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 8000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "Congratulations!!! All prefill and decode servers are up . . ." + + ROUTER_CMD="python -m sglang_router.launch_router \ + --pd-disaggregation \ + --port 30000 \ + --policy random \ + --prefill-policy random \ + --decode-policy random \ + ${PREFILL_ARGS} \ + ${DECODE_ARGS}" + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" + set -x + if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + else + eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & + fi + set +x + proxy_pid=$! 
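+ # Note (illustrative): the router may be listening on port 30000 before every prefill/decode worker has registered, so the /readiness poll below is what actually gates the benchmark start.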
+ + # Wait for router to be ready via health endpoint + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-health \ + --health-endpoint /readiness \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + # Export IS_MTP based on whether MTP is enabled + if [ "$DECODE_MTP_SIZE" -gt 0 ]; then + export IS_MTP=true + else + export IS_MTP=false + fi + + # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill0_pid + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" + + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/${MODEL_NAME} \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) + prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh similarity index 95% rename from benchmarks/multi_node/vllm_disagg_utils/server.sh rename to benchmarks/multi_node/amd_utils/server_vllm.sh index 9b0ff2ebb..a10e45d6d 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -49,7 +49,7 @@ MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" # ============================================================================= # Dependencies and Environment Setup # ============================================================================= -source $VLLM_WS_PATH/env.sh +source $WS_PATH/env.sh host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') # RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) @@ -110,7 +110,7 @@ fi # ============================================================================= # Model-Specific Configuration from YAML # ============================================================================= -MODELS_YAML="${VLLM_WS_PATH}/models.yaml" +MODELS_YAML="${WS_PATH}/models_vllm.yaml" if [[ ! 
-f "$MODELS_YAML" ]]; then echo "ERROR: models.yaml not found at $MODELS_YAML" @@ -150,19 +150,19 @@ print(f'DECODE_MODEL_ENVS=\"{dev}\"') echo "Loaded model configuration for: $MODEL_NAME" -# Apply tensor-parallel size and EP/DP flags from submit pipeline (YAML PREFILL_TP / dp-attn / ep). -if [[ -n "${PREFILL_TP:-}" ]]; then +# Apply tensor-parallel size and EP/DP flags from submit pipeline. +if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP}/g") + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g") else - PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP}" + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}" fi fi -if [[ -n "${DECODE_TP:-}" ]]; then +if [[ -n "${DECODE_TP_SIZE:-}" ]]; then if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP}/g") + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g") else - DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP}" + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}" fi fi if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then @@ -186,7 +186,7 @@ echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" # ============================================================================= echo "Waiting at the container creation barrier on $host_name" -python3 $VLLM_WS_PATH/sync.py barrier \ +python3 $WS_PATH/sync.py barrier \ --local-ip ${host_ip} \ --local-port 5000 \ --enable-port \ @@ -200,11 +200,11 @@ python3 $VLLM_WS_PATH/sync.py barrier \ # ============================================================================= echo "Proceeding to start etcd server on $host_name" -bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & etcd_pid=$! echo "Waiting at etcd server barrier on $host_name" -python3 $VLLM_WS_PATH/sync.py barrier \ +python3 $WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 2379 \ --wait-for-all-ports \ @@ -217,7 +217,7 @@ echo "etcd endpoint health==================" etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true echo "======================================" -python3 $VLLM_WS_PATH/sync.py barrier \ +python3 $WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 2379 \ --wait-for-all-ports \ @@ -284,7 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." 
PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $VLLM_WS_PATH/moriio_proxy.py" + python3 $WS_PATH/moriio_proxy.py" if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PROXY_CMD" @@ -317,7 +317,7 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: skipping barrier (wait-for-all-ports)" else - python3 $VLLM_WS_PATH/sync.py barrier \ + python3 $WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports $SERVER_PORT \ --wait-for-all-ports \ @@ -327,7 +327,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Congratulations!!! All prefill and decode servers are up . . ." # Wait for proxy /health to confirm it is accepting requests - HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports ${ROUTER_PORT} \ --wait-for-all-health \ @@ -343,10 +343,10 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Ready for benchmarking on ${host_name}:${host_ip}" echo "Benchmarking on ${host_name}:${host_ip}" - cd $VLLM_WS_PATH + cd $WS_PATH export ROUTER_PORT=$ROUTER_PORT - BENCH_CMD="bash $VLLM_WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" @@ -401,7 +401,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then fi echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports ${ROUTER_PORT} \ --wait-for-all-ports \ @@ -414,7 +414,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then fi echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + WAIT_CMD="python3 $WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port ${ROUTER_PORT}" @@ -455,7 +455,7 @@ else fi echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports ${ROUTER_PORT} \ --wait-for-all-ports \ @@ -468,7 +468,7 @@ else fi echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + WAIT_CMD="python3 $WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port ${ROUTER_PORT}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh similarity index 99% rename from benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh rename to benchmarks/multi_node/amd_utils/setup_deps.sh index 7f691d141..8c7a9f07a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -839,7 +839,7 @@ except Exception as e: # MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. # --------------------------------------------------------------------------- patch_minimax_m2_wideep_mori() { - local patch_file="${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}/patches/minimax_m2.py" + local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" if [[ ! 
-f "$patch_file" ]]; then # Also check the Docker-baked location patch_file="/opt/vllm_disagg/patches/minimax_m2.py" diff --git a/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh similarity index 100% rename from benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh rename to benchmarks/multi_node/amd_utils/start_etcd.sh diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index be22b8d33..0b1c2b2f6 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -2,37 +2,51 @@ # # Cluster Configuration Template for Multi-Node Disaggregated Serving # -# This script submits a multi-node SGLang disaggregated benchmark job to SLURM. +# This script submits a multi-node disaggregated benchmark job to SLURM. # It must be configured for your specific cluster before use. +# +# ENGINE=sglang (default): SGLang disaggregated serving +# ENGINE=vllm: vLLM disaggregated serving +# +# Router is co-located with the first prefill node (same for both engines), +# so NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' -This script aims to provide a one-liner call to the submit_job_script.py, -so that the deployment process can be further simplified. - -To use this script, fill in the following script and run it under your `slurm_jobs` directory: -======== begin script area ======== -# REQUIRED: Cluster-specific configuration -export SLURM_ACCOUNT= # Your SLURM account name -export SLURM_PARTITION= # SLURM partition to submit to -export TIME_LIMIT= # Job time limit (e.g., "08:00:00") - -# REQUIRED: Model and container paths -export MODEL_PATH= # Path to model directory (e.g., /mnt/models, /nfsdata) -export CONTAINER_IMAGE= # Path to container squash file - -# REQUIRED: Hardware configuration -export GPUS_PER_NODE= # GPUs per node (e.g., 8 for MI355X, 4 for MI325X) - -# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD) -# export IBDEVICES= # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) 
-# export MORI_RDMA_TC= # RDMA traffic class (e.g., 96, 104) -bash submit.sh \ -$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \ -$ADDITIONAL_FRONTENDS \ -$ISL $OSL $CONCURRENCIES $REQUEST_RATE -======== end script area ======== +Usage: + bash submit.sh \ + <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \ + <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \ + <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \ + <PREFILL_TP> <DECODE_TP> <RANDOM_RANGE_RATIO> \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false or 1/0 (expert parallelism on prefill) + PREFILL_ENABLE_DP true/false or 1/0 (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false or 1/0 (expert parallelism on decode) + DECODE_ENABLE_DP true/false or 1/0 (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) USAGE } @@ -53,6 +67,7 @@ check_env MODEL_PATH check_env MODEL_NAME check_env CONTAINER_IMAGE check_env RUNNER_NAME +check_env FRAMEWORK # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed. GPUS_PER_NODE="${GPUS_PER_NODE:-8}" @@ -66,31 +81,32 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-1} -PREFILL_ENABLE_DP=${10:-1} -DECODE_ENABLE_EP=${11:-1} -DECODE_ENABLE_DP=${12:-1} +PREFILL_ENABLE_EP=${9:-true} +PREFILL_ENABLE_DP=${10:-true} +DECODE_ENABLE_EP=${11:-true} +DECODE_ENABLE_DP=${12:-true} PREFILL_TP=${13:-8} DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15} +RANDOM_RANGE_RATIO=${15:-0.8} NODE_LIST=${16} - NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job +# Normalize FRAMEWORK (e.g. "vllm-disagg", "sglang-disagg") to the engine name used below. +export ENGINE="${FRAMEWORK:-sglang}" +ENGINE="${ENGINE%-disagg}" export MODEL_DIR=$MODEL_PATH export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args - - +# Engine-specific xP/yD semantics and TP exports +if [[ "$ENGINE" == "vllm" ]]; then + export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} + export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} +fi +# xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS export yD=$DECODE_WORKERS -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} @@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} + +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export
BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} -export BENCH_NUM_PROMPTS_MULTIPLIER=10 +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -117,13 +137,10 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -# SLURM writes output files on the batch node, so /tmp won't work (node-local). -# Defaults to a sibling directory of the submit working directory. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" # Optional: pass an explicit node list to sbatch. -# NODE_LIST is expected to be comma-separated hostnames. NODELIST_OPT=() if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" @@ -136,6 +153,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. +EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -144,6 +168,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" @@ -153,7 +178,6 @@ sbatch_cmd=( "$(dirname "$0")/job.slurm" ) -# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct JOB_ID=$("${sbatch_cmd[@]}") if [[ $? -ne 0 ]]; then echo "Error: Failed to submit job with sbatch" >&2 diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/amd_utils/sync.py +++ b/benchmarks/multi_node/amd_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. 
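+ # Example: with --timeout 300 the port stays open for max(60, 150) = 150 seconds; with a non-positive --timeout it falls back to a 300-second grace period.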
+ grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh index b21e9204a..d7995fb25 100755 --- a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -27,7 +28,7 @@ fi set -x -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh index 137ee0381..a9a28d889 100644 --- a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -27,7 +28,7 @@ fi set -x -cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh deleted file mode 100755 index 274c5954e..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# vLLM Disaggregated Benchmark Runner -# -# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh) -# so that the CI pipeline can collect and process results. 
-# -# Usage: bash bench.sh \ -# \ -# - -n_prefill=$1 -n_decode=$2 -prefill_gpus=$3 -decode_gpus=$4 -model_path=$5 -model_name=$6 -MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" -log_path=$7 - -chosen_isl=${8:-1024} -chosen_osl=${9:-1024} -concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-inf} -random_range_ratio=${12:-0.8} -num_prompts_multiplier=${13:-10} - -IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" - -ROUTER_PORT="${ROUTER_PORT:-30000}" - -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p "$profile_folder" - -source "$(dirname "$0")/../../benchmark_lib.sh" - -REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" - -for max_concurrency in "${chosen_concurrencies[@]}"; do - - export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" - - num_prompts=$(( max_concurrency * num_prompts_multiplier )) - if [[ "$num_prompts" -lt 16 ]]; then - num_prompts=16 - fi - - echo "profile_folder: $profile_folder" - echo "max_concurrency: $max_concurrency" - echo "chosen_req_rate: $chosen_req_rate" - echo "MODEL_PATH: $MODEL_PATH" - echo "ROUTER_PORT: $ROUTER_PORT" - echo "chosen_isl: $chosen_isl" - echo "chosen_osl: $chosen_osl" - echo "num_prompts: $num_prompts" - echo "export_file: $export_file" - - run_benchmark_serving \ - --bench-serving-dir "$REPO_ROOT" \ - --model "$MODEL_PATH" \ - --port "$ROUTER_PORT" \ - --backend openai \ - --input-len "$chosen_isl" \ - --output-len "$chosen_osl" \ - --random-range-ratio "$random_range_ratio" \ - --num-prompts "$num_prompts" \ - --max-concurrency "$max_concurrency" \ - --result-filename "$export_file" \ - --result-dir /workspace/ \ - --trust-remote-code - - echo "-----------------------------------------" - echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." - sleep 10 -done diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh deleted file mode 100755 index e1cc2f6af..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -# vLLM/Nixl environment setup for multi-node disaggregated serving. -# -# REQUIRED ENVIRONMENT VARIABLES: -# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# Set by runner or auto-detected from hostname. -# -# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is -# sourced at the top of server.sh before this file. - -set -x - -# IBDEVICES configuration -# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) -# Fall back to hostname detection if not set (for direct script execution) -if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 - else - DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') - if [[ -n "$DETECTED" ]]; then - export IBDEVICES="$DETECTED" - else - echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." 
>&2 - fi - fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" -else - echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" -fi - -if [[ -z "$UCX_NET_DEVICES" ]]; then - # Use the first benic interface for UCX TCP transport (maps to ionic RDMA NIC). - # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1) - # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider). - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) - if [[ -n "$UCX_NET_DEV" ]]; then - export UCX_NET_DEVICES="$UCX_NET_DEV" - else - FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) - if [[ -n "$FIRST_IB" ]]; then - export UCX_NET_DEVICES="${FIRST_IB}:1" - fi - fi - echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" -else - echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" -fi - -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} - -# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing -export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} - -# QoS/DSCP configuration for lossless RoCEv2 fabric. -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then - echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' -$1 == "DSCP" && $2 == ":" && $NF == p { - print $3; exit -}') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) - export UCX_IB_SL=$ND_PRIO - echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" - else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export UCX_IB_TRAFFIC_CLASS=96 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export UCX_IB_TRAFFIC_CLASS=104 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - fi - fi -else - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export UCX_IB_TRAFFIC_CLASS=96 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export UCX_IB_TRAFFIC_CLASS=104 - echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" - else - echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
- fi -fi - -set +x -echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm deleted file mode 100644 index e1cad0817..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ /dev/null @@ -1,358 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=vllm-pd-bench -#SBATCH -N 3 # Overridden by submit.sh -N flag -#SBATCH -n 3 # Overridden by submit.sh -n flag -#SBATCH --ntasks-per-node=1 -#SBATCH --spread-job -#SBATCH --gres=gpu:8 -#SBATCH --time=24:00:00 -# --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR - -echo "=== Job Start Time ===" -echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" -echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" -echo "=======================" -echo "" - -# ============================================================================= -# Model Validation -# ============================================================================= - -# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ -# at runtime, but the CWD remains the submit-time directory (vllm_disagg_utils/). -MODELS_YAML="$(pwd)/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -MODEL_NAME="${MODEL_NAME:-None}" -if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" - echo "Available models:" - grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' - exit 1 -fi -echo "Model found: $MODEL_NAME" - -RUN_FILE="server.sh" -echo "Runfile set: $RUN_FILE" - -# DI_REPO_DIR points to the repo root. -# $(pwd) is vllm_disagg_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. -export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) - -xP="${xP:-1}" -yD="${yD:-1}" - -# Benchmark configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" - -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - -# ============================================================================= -# Docker privilege detection -# ============================================================================= -# Detect on the batch host (used for post-srun cleanup). -# Per-node detection happens inside the srun inline script below because -# some nodes may require sudo while others do not. 
-if docker ps &>/dev/null; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD - -# ============================================================================= -# Model Path Resolution -# ============================================================================= - -# MODEL_DIR detection: prefer env var, fall back to hostname detection -if [[ -z "$MODEL_DIR" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - MODEL_DIR="/nfsdata" - elif [[ $NODENAME == mia1* ]]; then - MODEL_DIR="/it-share/data" - else - MODEL_DIR="/nfsdata" - fi - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" -fi -export MODEL_DIR - -# Extract hf_dir from models.yaml (the line after the model's top-level key) -DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} - found && /^[^ ]/{exit} - found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") -DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" -echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" - -resolve_hf_cache_path() { - local base_path=$1 - if [[ -d "${base_path}/snapshots" ]]; then - local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) - if [[ -n "$snapshot" ]]; then - echo "${base_path}/snapshots/${snapshot}" - return 0 - fi - fi - echo "$base_path" - return 1 -} - -MODEL_PATH="" -SEARCH_PATHS=( - "${MODEL_DIR}/${DISK_DIR_NAME}" - "${MODEL_DIR}/${MODEL_NAME}" - "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" - "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" -) - -for search_path in "${SEARCH_PATHS[@]}"; do - if [[ -d "$search_path" ]]; then - RESOLVED=$(resolve_hf_cache_path "$search_path") - MODEL_PATH="$RESOLVED" - echo "Found MODEL_PATH: $MODEL_PATH" - break - fi -done - -if [[ -z "$MODEL_PATH" ]]; then - echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" - for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done - exit 1 -fi -echo "Final MODEL_PATH: $MODEL_PATH" - -# ============================================================================= -# Node Selection and vLLM-Specific NUM_NODES -# ============================================================================= - -# Router co-located with first prefill: xP + yD nodes total (same as SGLang) -NUM_NODES=$((xP + yD)) -echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located with first prefill)" - -FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) -SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') - -# Update SLURM environment variables -export SLURM_NNODES=$NUM_NODES -export SLURM_NTASKS=$NUM_NODES -export SLURM_JOB_NUM_NODES=$NUM_NODES -export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" -export SLURM_NODELIST="$SELECTED_NODELIST_STR" -export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_NTASKS_PER_NODE=1 - -echo "" -echo "Selected nodes: $SELECTED_NODELIST_STR" - -# ============================================================================= -# IP Resolution -# ============================================================================= - -USER_NAME=$(whoami) -MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) -NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') -NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') - -IPS=() -for NODE in $SELECTED_NODES; do - IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') - IP=$(echo "$IP" | awk '/src/ {print $7}') - IPS+=("$IP") -done - -echo "Node IPs: ${IPS[*]}" - -DOCKER_MOUNT_PATH="/workspace" -VLLM_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/vllm_disagg_utils" - -NNODES=$NUM_NODES - -echo "MASTER_NODE: ${MASTER_NODE}" -echo "NODE0_ADDR: ${NODE0_ADDR}" -echo "NNODES: ${NNODES}" -echo "REPO DIR: ${DI_REPO_DIR}" -echo "USER: ${USER_NAME}" - -# Reduce log spam -export TQDM_MININTERVAL=20 - -# Translate the host-resolved MODEL_PATH to the Docker mount namespace -DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" - -export DI_REPO_DIR=$DI_REPO_DIR -export VLLM_WS_PATH=$VLLM_WS_PATH -export NNODES=$NNODES -export NODE0_ADDR=$NODE0_ADDR -export MODEL_PATH=$MODEL_PATH -export MODEL_DIR=$MODEL_DIR -export xP=$xP -export yD=$yD -export MODEL_NAME=$MODEL_NAME -export USER_NAME=$USER_NAME -export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export GPUS_PER_NODE=$GPUS_PER_NODE -export BENCH_INPUT_LEN=$BENCH_INPUT_LEN -export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN -export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO -export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER -export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY -export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE -export DRY_RUN="${DRY_RUN:-0}" -export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" - -# TP / EP / DP (from vllm_disagg_utils/submit.sh; mirrors amd_utils disagg) -export PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" -export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" -export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" -export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" -export PREFILL_TP="${PREFILL_TP:-8}" -export DECODE_TP="${DECODE_TP:-8}" - -SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export 
DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" - -SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - -cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." - rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - echo "[${SLURM_JOB_ID}] cleanup done." -} - -trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes -echo "Refreshing NFS caches on all nodes..." -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' - sync - ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils > /dev/null 2>&1 - stat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 - cat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 - echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true - echo "NFS cache refreshed on $(hostname)" -' - -srun \ - --nodelist="$SELECTED_NODELIST_SRUN" \ - --kill-on-bad-exit=1 \ - --signal=TERM@30 \ - --unbuffered \ - bash -lc " -set -euo pipefail - -echo \"Rank \$SLURM_PROCID on \$(hostname)\" - -# Per-node Docker privilege detection (some nodes need sudo, others don't) -if docker ps &>/dev/null; then - _DCMD=docker -else - _DCMD='sudo docker' -fi - -# Pre-clean (idempotent) -\$_DCMD ps -aq --filter \"name=^container_vllm_\" | xargs -r \$_DCMD rm -f || true -\$_DCMD ps -aq | xargs -r \$_DCMD stop || true - -exec \$_DCMD run --rm \ - --init \ - --stop-timeout 10 \ - --device /dev/dri \ - --device /dev/kfd \ - --device /dev/infiniband \ - --device=/dev/infiniband/rdma_cm \ - --device=/dev/infiniband/uverbs0 \ - --device=/dev/infiniband/uverbs1 \ - --device=/dev/infiniband/uverbs2 \ - --device=/dev/infiniband/uverbs3 \ - --device=/dev/infiniband/uverbs4 \ - --device=/dev/infiniband/uverbs5 \ - --device=/dev/infiniband/uverbs6 \ - --device=/dev/infiniband/uverbs7 \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ - --network host \ - --ipc host \ - --group-add video \ - --cap-add SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --privileged \ - -v /sys:/sys \ - $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ - -v ${MODEL_DIR}:/models \ - -v \$HOME/.ssh:/root/.ssh \ - --shm-size 128G \ - -v /tmp:/run_logs \ - -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ - -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e MODEL_NAME=\$MODEL_NAME \ - -e MODEL_PATH=$DOCKER_MODEL_PATH \ - -e VLLM_WS_PATH=${VLLM_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e IPADDRS=\$IPADDRS \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ - -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ - -e UCX_MEMTYPE_CACHE=y \ - -e UCX_RNDV_SCHEME=get_zcopy \ - -e UCX_RNDV_THRESH=4k \ - -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ - -e UCX_LOG_LEVEL=warn \ - -e HSA_ENABLE_SDMA=1 \ - -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ - -e 
VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ - -e PYTHONPYCACHEPREFIX=/tmp/pycache \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e PREFILL_TP=\$PREFILL_TP \ - -e DECODE_TP=\$DECODE_TP \ - --name \"$DOCKER_CONT_NAME\" \ - --entrypoint \"\" \ - \"$DOCKER_IMAGE_NAME\" bash -lc ' - mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' - '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log - ' - -DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi -" - -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'if docker ps &>/dev/null; then D=docker; else D="sudo docker"; fi; $D rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh deleted file mode 100755 index ecb5a9876..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash -# -# Cluster Configuration Template for Multi-Node vLLM Disaggregated Serving -# -# This script submits a multi-node vLLM disaggregated benchmark job to SLURM. -# It must be configured for your specific cluster before use. -# -# Router is co-located with the first prefill node (same as SGLang), so -# NUM_NODES = PREFILL_NODES + DECODE_NODES. - -usage() { - cat << 'USAGE' -Usage: - bash submit.sh \ - \ - \ - \ - \ - [NODE_LIST] - -Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - PREFILL_ENABLE_EP true/false (from PREFILL_EP in YAML; false when EP==1) - PREFILL_ENABLE_DP true/false (data-parallel attention on prefill) - DECODE_ENABLE_EP true/false (from DECODE_EP in YAML) - DECODE_ENABLE_DP true/false (data-parallel attention on decode) - PREFILL_TP Tensor parallel size per prefill node - DECODE_TP Tensor parallel size per decode node - RANDOM_RANGE_RATIO Random range ratio for benchmark client - NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) - -Required environment variables: - SLURM_ACCOUNT SLURM account name - SLURM_PARTITION SLURM partition - TIME_LIMIT Job time limit (e.g., "08:00:00") - MODEL_PATH Path to model directory (e.g., /nfsdata) - MODEL_NAME Model name directory - CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) - RUNNER_NAME Runner identifier (for job name) -USAGE -} - -check_env() { - local name="$1" - if [[ -z "${!name:-}" ]]; then - echo "Error: ${name} not specified" >&2 - usage >&2 - exit 1 - fi -} - -check_env SLURM_ACCOUNT -check_env SLURM_PARTITION -check_env TIME_LIMIT - -check_env MODEL_PATH -check_env MODEL_NAME -check_env CONTAINER_IMAGE -check_env RUNNER_NAME - -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - -# COMMAND_LINE ARGS (aligned with benchmarks/multi_node/amd_utils/submit.sh) -PREFILL_NODES=$1 -PREFILL_WORKERS=${2:-1} -DECODE_NODES=$3 -DECODE_WORKERS=${4:-1} -ISL=$5 -OSL=$6 -CONCURRENCIES=$7 -REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-false} -PREFILL_ENABLE_DP=${10:-false} -DECODE_ENABLE_EP=${11:-false} 
-DECODE_ENABLE_DP=${12:-false} -PREFILL_TP=${13:-8} -DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15:-0.8} -NODE_LIST=${16} - -# Router co-located with first prefill: xP + yD nodes total -NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) -profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" - -# Export variables for the SLURM job -export MODEL_DIR=$MODEL_PATH -export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE -export PROFILER_ARGS=$profiler_args - -# For vLLM, each worker = 1 node (TP=8 per node). -# xP/yD must match the node counts so NUM_NODES = xP+yD is correct. -export xP=$PREFILL_NODES -export yD=$DECODE_NODES -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME -export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} -export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} -export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} -export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} -export PREFILL_TP=${PREFILL_TP} -export DECODE_TP=${DECODE_TP} -export BENCH_INPUT_LEN=${ISL} -export BENCH_OUTPUT_LEN=${OSL} -export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} -export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} -export BENCH_REQUEST_RATE=${REQUEST_RATE} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} - -export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} -export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} - -# Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" -mkdir -p "$BENCHMARK_LOGS_DIR" - -# Optional: pass an explicit node list to sbatch. -NODELIST_OPT=() -if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then - IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" - if [[ "${#NODE_ARR[@]}" -ne "$NUM_NODES" ]]; then - echo "Error: NODE_LIST has ${#NODE_ARR[@]} nodes but NUM_NODES=${NUM_NODES}" >&2 - echo "Error: NODE_LIST='${NODE_LIST}'" >&2 - exit 1 - fi - NODELIST_CSV="$(IFS=,; echo "${NODE_ARR[*]}")" - NODELIST_OPT=(--nodelist "$NODELIST_CSV") -fi - -# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). -# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. -EXCLUDE_OPT=() -if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then - EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") -fi - -# Construct the sbatch command -sbatch_cmd=( - sbatch - --parsable - -N "$NUM_NODES" - -n "$NUM_NODES" - "${NODELIST_OPT[@]}" - "${EXCLUDE_OPT[@]}" - --time "$TIME_LIMIT" - --partition "$SLURM_PARTITION" - --account "$SLURM_ACCOUNT" - --job-name "$RUNNER_NAME" - --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" - --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" - "$(dirname "$0")/job.slurm" -) - -JOB_ID=$("${sbatch_cmd[@]}") -if [[ $? -ne 0 ]]; then - echo "Error: Failed to submit job with sbatch" >&2 - exit 1 -fi -echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py deleted file mode 100755 index 3678e7614..000000000 --- a/benchmarks/multi_node/vllm_disagg_utils/sync.py +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-node synchronization utilities for disaggregated inference. 
- -Subcommands: - barrier - Wait until all specified nodes have opened their ports (TCP barrier) - Optionally wait for HTTP health endpoints to return 200 - wait - Block until a remote port closes (shutdown coordination) -""" - -import socket -import time -import threading -import argparse -import sys -import urllib.request -import urllib.error - - -def is_port_open(ip, port, timeout=2): - """Check if a given IP and port are accessible.""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.settimeout(timeout) - return s.connect_ex((ip, port)) == 0 - - -def check_health(ip, port, path="/health", timeout=2): - """Return True if http://ip:port/path returns HTTP 200.""" - try: - url = f"http://{ip}:{port}{path}" - req = urllib.request.Request(url) - with urllib.request.urlopen(req, timeout=timeout) as resp: - return getattr(resp, "status", 200) == 200 - except (urllib.error.URLError, urllib.error.HTTPError, OSError): - return False - - -# ============================================================================= -# barrier subcommand -# ============================================================================= - -def cmd_barrier(args): - """Wait until all nodes have opened the specified ports.""" - NODE_IPS = [ip.strip() for ip in args.node_ips.split(",") if ip.strip()] - NODE_PORTS = [int(p.strip()) for p in args.node_ports.split(",") if p.strip()] - - if not NODE_IPS: - print("Error: NODE_IPS argument is empty or not set.") - sys.exit(1) - - if len(NODE_PORTS) == 1: - NODE_PORTS *= len(NODE_IPS) - elif len(NODE_PORTS) != len(NODE_IPS): - print("Error: Number of ports must match number of node IPs or only one port should be given for all.") - sys.exit(1) - - server_socket = None - - def open_port(): - nonlocal server_socket - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server_socket.bind((args.local_ip, args.local_port)) - server_socket.listen(5) - print(f"Port {args.local_port} is now open on {args.local_ip}.") - while True: - conn, addr = server_socket.accept() - conn.close() - - def close_port(): - nonlocal server_socket - if server_socket: - server_socket.close() - print(f"Port {args.local_port} has been closed on {args.local_ip}.") - - if args.enable_port: - threading.Thread(target=open_port, daemon=True).start() - - # Wait for all ports (TCP check) - if args.wait_for_all_ports: - start_time = time.time() - timeout = args.timeout - - while True: - if timeout > 0: - elapsed = time.time() - start_time - if elapsed >= timeout: - not_open = [(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS) - if not is_port_open(ip, port)] - print(f"ERROR: Timeout after {timeout} seconds waiting for ports to open.", flush=True) - print("The following nodes/ports are still not responding:", flush=True) - for ip, port in not_open: - print(f" - {ip}:{port}", flush=True) - sys.exit(1) - - all_open = all(is_port_open(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS)) - if all_open: - break - - if timeout > 0: - remaining = timeout - (time.time() - start_time) - print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . . ({remaining:.0f}s remaining)", flush=True) - else: - print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . 
.", flush=True) - time.sleep(5) - - # Wait for all health endpoints (HTTP check) - if args.wait_for_all_health: - health_path = args.health_endpoint - start_time = time.time() - timeout = args.timeout - - while True: - if timeout > 0: - elapsed = time.time() - start_time - if elapsed >= timeout: - not_ready = [ - (ip, port) - for ip, port in zip(NODE_IPS, NODE_PORTS) - if not check_health(ip, port, health_path) - ] - print(f"ERROR: Timeout after {timeout} seconds waiting for health endpoints.", flush=True) - print(f"The following (http://ip:port{health_path}) are still not responding:", flush=True) - for ip, port in not_ready: - print(f" - http://{ip}:{port}{health_path}", flush=True) - sys.exit(1) - - all_ready = all( - check_health(ip, port, health_path) - for ip, port in zip(NODE_IPS, NODE_PORTS) - ) - if all_ready: - break - - if timeout > 0: - remaining = timeout - (time.time() - start_time) - print( - f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) .. ({remaining:.0f}s remaining)", - flush=True, - ) - else: - print(f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) ..", flush=True) - time.sleep(30) - - if args.enable_port: - # Keep the port open long enough for slow nodes to pass their barrier. - # The previous 30s was too short when setup times vary by minutes. - grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 - time.sleep(grace) - close_port() - - -# ============================================================================= -# wait subcommand -# ============================================================================= - -def cmd_wait(args): - """Wait while a remote port remains open, exit when it closes.""" - print(f"Waiting while port {args.remote_port} on {args.remote_ip} is open...") - while is_port_open(args.remote_ip, args.remote_port): - time.sleep(5) - print(f"Port {args.remote_port} on {args.remote_ip} is now closed.") - - -# ============================================================================= -# CLI -# ============================================================================= - -def main(): - parser = argparse.ArgumentParser(description="Multi-node synchronization utilities.") - subparsers = parser.add_subparsers(dest="command", required=True) - - # barrier subcommand - bp = subparsers.add_parser("barrier", help="Wait for all nodes to open specified ports.") - bp.add_argument("--local-ip", required=False, help="Local IP address to bind the server.") - bp.add_argument("--local-port", type=int, required=False, help="Port number to bind the server.") - bp.add_argument("--enable-port", action="store_true", help="Enable opening and closing of local port.") - bp.add_argument("--node-ips", required=True, help="Comma-separated list of node IPs.") - bp.add_argument("--node-ports", required=True, help="Comma-separated list of ports to check.") - bp.add_argument("--timeout", type=int, default=600, - help="Timeout in seconds (default: 600). 
Set to 0 for no timeout.") - bp.add_argument("--wait-for-all-ports", action="store_true", - help="Wait until all node ports are open (TCP).") - bp.add_argument("--wait-for-all-health", action="store_true", - help="Wait until http://ip:port/health returns 200 for all nodes.") - bp.add_argument("--health-endpoint", default="/health", - help="Path for health check (default: /health).") - bp.set_defaults(func=cmd_barrier) - - # wait subcommand - wp = subparsers.add_parser("wait", help="Wait while a remote port remains open.") - wp.add_argument("--remote-ip", required=True, help="Remote server IP address.") - wp.add_argument("--remote-port", type=int, required=True, help="Remote port number.") - wp.set_defaults(func=cmd_wait) - - args = parser.parse_args() - args.func(args) - - -if __name__ == "__main__": - main() From 5adfe2b92df4d920a95dc509b9832da315cacbf8 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 07:57:08 +0000 Subject: [PATCH 30/31] use vLLM router as default router for vllm disagg Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/job.slurm | 34 ++++++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 40 +++++++++++-------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 172d9e73b..bcafaa910 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -77,6 +77,11 @@ PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + # ============================================================================= # Docker privilege detection # ============================================================================= @@ -288,6 +293,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" + +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) @@ -396,6 +405,24 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +# Start vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \\ + --name \"$ROUTER_CONT_NAME\" \\ + --network host \\ + \"$VLLM_ROUTER_IMAGE\" \\ + vllm-router \\ + --vllm-pd-disaggregation \\ + --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ + --port \"${ROUTER_PORT}\" \\ + --host 0.0.0.0 \\ + --policy consistent_hash \\ + --prefill-policy consistent_hash \\ + --decode-policy consistent_hash \\ + --log-level info +fi + exec \$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ @@ -444,3 +471,10 @@ fi " srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + +# Clean 
up vLLM external router container on node 0 +if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' +fi diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index a10e45d6d..6b70014ee 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -282,19 +282,24 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" + # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." + PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! - sleep 3 + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" fi PREFILL_CMD="vllm serve ${MODEL_PATH} \ @@ -368,13 +373,16 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server and prefill server" + echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - # Fallback: ensure no orphaned processes keep ports open - pkill -f moriio_proxy 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi pkill -f "vllm serve" 2>/dev/null || true fi From 0734709418f0b9c69247e6522824290d56bf6d36 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 23 Apr 2026 01:49:52 +0000 Subject: [PATCH 31/31] fix bugs Signed-off-by: Chun Fang --- benchmarks/multi_node/amd_utils/bench.sh | 6 +- benchmarks/multi_node/amd_utils/env.sh | 4 +- benchmarks/multi_node/amd_utils/job.slurm | 60 +- benchmarks/multi_node/amd_utils/server.sh | 656 +----------------- .../multi_node/amd_utils/server_vllm.sh | 54 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 10 +- benchmarks/multi_node/amd_utils/submit.sh | 2 +- 7 files changed, 74 insertions(+), 718 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 87f3b1e8a..aecc29e83 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -11,7 +11,7 @@ # \ # -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" n_prefill=$1 n_decode=$2 @@ -67,7 +67,7 @@ for max_concurrency in 
"${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code" else if [ "$IS_MTP" = "true" ]; then @@ -92,7 +92,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "-----------------------------------------" # vLLM: cooldown between rounds for idle KV block reaper - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." sleep 10 fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c5a438541..81da415e8 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -9,7 +9,7 @@ # Set by runner or auto-detected from hostname. set -x -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 # ============================================================================= @@ -43,7 +43,7 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # Engine-specific environment # ============================================================================= -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= # vLLM/Nixl-specific environment # ========================================================================= diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index bcafaa910..abb80b97b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -8,7 +8,7 @@ #SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" @@ -23,7 +23,7 @@ echo "" # Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ # at runtime, but the CWD remains the submit-time directory (amd_utils/). 
-if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then MODELS_YAML="$(pwd)/models_vllm.yaml" else MODELS_YAML="$(pwd)/models.yaml" @@ -111,7 +111,7 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} found && /^[^ ]/{exit} @@ -278,6 +278,7 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) @@ -366,7 +367,7 @@ DOCKER_ENV_COMMON=( ) # Engine-specific env vars -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then DOCKER_ENV_ENGINE=( -e VLLM_WS_PATH=${WS_PATH} -e MODEL_PATH=$DOCKER_MODEL_PATH @@ -402,28 +403,29 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true -\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true # Start vLLM external router container on node 0 -if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true - \$DOCKER_CMD run -d \\ - --name \"$ROUTER_CONT_NAME\" \\ - --network host \\ - \"$VLLM_ROUTER_IMAGE\" \\ - vllm-router \\ - --vllm-pd-disaggregation \\ - --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ - --port \"${ROUTER_PORT}\" \\ - --host 0.0.0.0 \\ - --policy consistent_hash \\ - --prefill-policy consistent_hash \\ - --decode-policy consistent_hash \\ - --log-level info + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -470,11 +472,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' -# Clean up vLLM external router container on node 0 -if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then - srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true - ' -fi + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 
--ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 0d5685a4d..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -2,666 +2,18 @@ # Dual-Engine Disaggregated Server Dispatcher # ============================================================================= # Dispatches to the engine-specific server launcher based on ENGINE env var. -# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) -# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) +# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" export WS_PATH ENGINE echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then source "$WS_PATH/server_vllm.sh" else source "$WS_PATH/server_sglang.sh" fi -<<<<<<< HEAD - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) - -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') - -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP -else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do 
-    prefill_idx=$((i * PREFILL_NODES_PER_WORKER))
-    PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}"
-    PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000"
-done
-
-# Build decode arguments dynamically based on yD
-DECODE_HEADNODE_URLS=()
-DECODE_ARGS=""
-for i in $(seq 0 $((yD - 1))); do
-    decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
-    DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}"
-    DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000"
-done
-
-echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
-echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
-
-# =============================================================================
-# Configuration Builder Functions
-# =============================================================================
-
-build_server_config() {
-    local mode="$1"
-    local model_name="$2"
-    local tp_size="$3"
-    local enable_ep="$4"
-    local enable_dp="$5"
-    local decode_mtp_size="$6"
-
-    # Calculate EP and DP sizes based on enable flags
-    local ep_size=1
-    local dp_size=1
-
-    if [[ "$enable_ep" == "true" ]]; then
-        ep_size=$tp_size
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_size=$tp_size
-    fi
-
-    # Build parallelism arguments
-    local parallel_args="--tp-size ${tp_size}"
-
-    if [[ "$enable_ep" == "true" ]]; then
-        parallel_args="$parallel_args --ep-size ${ep_size}"
-    fi
-
-    if [[ "$enable_dp" == "true" ]]; then
-        parallel_args="$parallel_args --dp-size ${dp_size}"
-    fi
-
-    # Get model-specific configuration from YAML-loaded variables
-    local base_config="$MODEL_BASE_FLAGS"
-    local mtp_config=""
-    local dp_config=""
-    local specific_config=""
-
-    # MTP config (only if MTP is enabled and mode is decode)
-    if [ "$decode_mtp_size" -gt 0 ]; then
-        mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))"
-    fi
-
-    # DP config (only if DP is enabled)
-    if [[ "$enable_dp" == "true" ]]; then
-        dp_config="$MODEL_DP_FLAGS"
-    fi
-
-    # Mode-specific config
-    if [[ "$mode" == "prefill" ]]; then
-        specific_config="$PREFILL_MODE_FLAGS"
-    elif [[ "$mode" == "decode" ]]; then
-        specific_config="$DECODE_MODE_FLAGS"
-    fi
-
-    # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config
-    local full_config="$parallel_args"
-    if [[ -n "$base_config" ]]; then
-        full_config="$full_config $base_config"
-    fi
-    if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then
-        full_config="$full_config $mtp_config"
-    fi
-    if [[ -n "$dp_config" ]]; then
-        full_config="$full_config $dp_config"
-    fi
-    if [[ -n "$specific_config" ]]; then
-        full_config="$full_config $specific_config"
-    fi
-
-    echo "$full_config"
-}
-
-# Build complete server configurations
-PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE")
-DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE")
-
-if [[ -n "$MODEL_NAME" ]]; then
-    echo "Using model-specific configuration for: $MODEL_NAME"
-fi
-
-# =============================================================================
-# Container Synchronization
-# =============================================================================
-
-echo "Waiting at the container creation barrier on $host_name"
-python3 $SGLANG_WS_PATH/sync.py barrier \
-    --local-ip ${host_ip} \
-    --local-port 5000 \
-    --enable-port \
-    --node-ips ${IPADDRS} \
-    --node-ports 5000 \
-    --wait-for-all-ports \
-    --timeout 300
-
-
-# =============================================================================
-# Node Role Assignment and Server Launch
-# =============================================================================
-
-if [ "$NODE_RANK" -eq 0 ]; then
-    echo "NODE INFO ======================================="
-    echo "================================================"
-    echo "Node List : ${SLURM_JOB_NODELIST}"
-    echo "Node IPs : ${IPADDRS}"
-    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
-    echo "================================================"
-
-    echo "CLUSTER INFO ===================================="
-    echo "================================================"
-    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
-    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
-    echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
-    echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}"
-    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
-    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
-    echo "================================================"
-
-    # start the head prefill server
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/$MODEL_NAME \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
-    fi
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill0_pid=$!
-    fi
-
-
-    echo "Waiting for all prefill and decode servers to be up . . ."
-
-
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${IPADDRS} \
-        --node-ports 8000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-    echo "Congratulations!!! All prefill and decode servers are up . . ."
-
-    ROUTER_CMD="python -m sglang_router.launch_router \
-        --pd-disaggregation \
-        --port 30000 \
-        --policy random \
-        --prefill-policy random \
-        --decode-policy random \
-        ${PREFILL_ARGS} \
-        ${DECODE_ARGS}"
-
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $ROUTER_CMD"
-    else
-        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
-        set -x
-        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
-            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
-        else
-            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
-        fi
-        set +x
-        proxy_pid=$!
-
-        # Wait for router to be ready via health endpoint
-        HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-            --node-ips ${NODE0_ADDR} \
-            --node-ports 30000 \
-            --wait-for-all-health \
-            --health-endpoint /readiness \
-            --timeout 1800"
-
-        if [[ "$DRY_RUN" -eq 1 ]]; then
-            echo "DRY RUN: $HEALTH_BARRIER_CMD"
-        else
-            eval "$HEALTH_BARRIER_CMD"
-        fi
-
-        echo "Router is ready for benchmarking"
-    fi
-
-
-    echo "Ready for benchmarking on ${host_name}:${host_ip}"
-
-    echo "Benchmarking on ${host_name}:${host_ip}"
-    cd $SGLANG_WS_PATH
-
-    # Export IS_MTP based on whether MTP is enabled
-    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
-        export IS_MTP=true
-    else
-        export IS_MTP=false
-    fi
-
-    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
-    BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
-        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
-        ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \
-        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
-
-    if [[ "${EVAL_ONLY:-false}" == "true" ]]; then
-        echo "EVAL_ONLY mode: skipping throughput benchmark"
-    elif [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BENCH_CMD"
-    else
-        set -x
-        eval "$BENCH_CMD"
-        set +x
-    fi
-
-    # Run evaluation if requested (before killing router)
-    if [[ "${RUN_EVAL:-false}" == "true" ]]; then
-        echo "Running lm-eval evaluation on Node 0..."
-
-        # Health check: verify the router is still serving before running eval.
-        # The throughput benchmark may have crashed/exhausted decode workers.
-        EVAL_HEALTH_OK=false
-        for _attempt in 1 2 3; do
-            if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then
-                EVAL_HEALTH_OK=true
-                break
-            fi
-            echo "Eval health check attempt $_attempt failed, retrying in 10s..."
-            sleep 10
-        done
-
-        if [[ "$EVAL_HEALTH_OK" != "true" ]]; then
-            echo "WARNING: Router health check failed after 3 attempts. Skipping eval."
-        else
-            # Must run from repo root so utils/evals/${task}.yaml resolves
-            pushd /workspace
-
-            # Source eval functions from benchmark_lib.sh
-            source /workspace/benchmarks/benchmark_lib.sh
-
-            # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list
-            if [[ -n "${EVAL_CONC:-}" ]]; then
-                export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}"
-            else
-                export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-            fi
-
-            if [[ "$DRY_RUN" -eq 1 ]]; then
-                echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})"
-            else
-                # Run lm-eval against the router on port 30000
-                run_eval --framework lm-eval --port 30000
-
-                # Set metadata env vars for append_lm_eval_summary
-                export TP="${PREFILL_TP_SIZE}"
-                export CONC="${EVAL_CONCURRENT_REQUESTS}"
-                export EP_SIZE=1
-                [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}"
-                export PREFILL_TP="${PREFILL_TP_SIZE}"
-                export PREFILL_EP=1
-                [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}"
-                export PREFILL_NUM_WORKERS="${xP}"
-                export DECODE_TP="${DECODE_TP_SIZE}"
-                export DECODE_EP=1
-                [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}"
-                export DECODE_NUM_WORKERS="${yD}"
-                export DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}"
-                export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}"
-                export ISL="${BENCH_INPUT_LEN}"
-                export OSL="${BENCH_OUTPUT_LEN}"
-                # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME
-                # are already set via Docker -e flags from job.slurm
-
-                append_lm_eval_summary
-                # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace
-
-                # Copy eval artifacts to run_logs for NFS extraction by runner
-                EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results"
-                mkdir -p "$EVAL_COPY_DIR"
-                for f in meta_env.json; do
-                    [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/"
-                done
-                # Use find for glob patterns to avoid "no match" errors
-                find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-                find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \;
-
-                echo "Eval completed. Artifacts staged in $EVAL_COPY_DIR"
-            fi
-
-            popd
-        fi
-    fi
-
-    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
-    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
-    mkdir -p "$LOGS_OUTPUT"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
-        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
-    fi
-
-    echo "Killing the proxy server and prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $proxy_pid
-        kill $prefill0_pid
-    fi
-
-elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
-    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
-    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
-
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
-        --model-path $MODEL_DIR/${MODEL_NAME} \
-        --disaggregation-mode prefill \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
-
-    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
-        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
-        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $PREFILL_CMD"
-    else
-        set -x
-        eval "$PREFILL_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
-        set +x
-        prefill_pid=$!
-    fi
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $NODE_RANK prefill server"
-
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $prefill_pid
-    fi
-
-else
-    RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER))
-    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})"
-    echo "Using decode config: $DECODE_SERVER_CONFIG"
-    echo "Decode node rank: $RANK"
-    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
-
-    DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
-        --model-path ${MODEL_DIR}/${MODEL_NAME} \
-        --disaggregation-mode decode \
-        --disaggregation-ib-device ${IBDEVICES} \
-        --host 0.0.0.0 \
-        --port 8000 \
-        --trust-remote-code \
-        ${DECODE_SERVER_CONFIG} \
-        --log-level-http warning"
-
-    if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
-        rank=$((RANK % DECODE_NODES_PER_WORKER))
-        decode_idx=$((RANK / DECODE_NODES_PER_WORKER))
-        DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank"
-    fi
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $DECODE_CMD"
-    else
-        set -x
-        eval "$DECODE_CMD" \
-            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log &
-
-        set +x
-        decode_pid=$!
-    fi
-
-
-    echo "Waiting for proxy server to be up..."
-    BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \
-        --node-ips ${NODE0_ADDR} \
-        --node-ports 30000 \
-        --wait-for-all-ports \
-        --timeout 1800"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $BARRIER_CMD"
-    else
-        eval "$BARRIER_CMD"
-    fi
-
-
-    echo "Waiting until proxy server closes..."
-    WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \
-        --remote-ip ${NODE0_ADDR} \
-        --remote-port 30000"
-
-    if [[ "$DRY_RUN" -eq 1 ]]; then
-        echo "DRY RUN: $WAIT_CMD"
-    else
-        eval "$WAIT_CMD"
-    fi
-
-    echo "Killing the rank $RANK decode server"
-    if [[ "$DRY_RUN" -eq 0 ]]; then
-        kill $decode_pid
-    fi
-
-fi
-
-echo "Script completed successfully"
-exit 0
-=======
->>>>>>> 766ba4ee (consolidate amd_utils for sglang and vllm)
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index 6b70014ee..73cad3adc 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -199,29 +199,29 @@ python3 $WS_PATH/sync.py barrier \
 # ETCD Server Setup
 # =============================================================================
 
-echo "Proceeding to start etcd server on $host_name"
-bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 &
-etcd_pid=$!
-
-echo "Waiting at etcd server barrier on $host_name"
-python3 $WS_PATH/sync.py barrier \
-    --node-ips ${IPADDRS} \
-    --node-ports 2379 \
-    --wait-for-all-ports \
-    --timeout 300
-
-echo "All etcd servers are up : $host_name"
-sleep 3
-
-echo "etcd endpoint health=================="
-etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true
-echo "======================================"
-
-python3 $WS_PATH/sync.py barrier \
-    --node-ips ${IPADDRS} \
-    --node-ports 2379 \
-    --wait-for-all-ports \
-    --timeout 300
+# echo "Proceeding to start etcd server on $host_name"
+# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 &
+# etcd_pid=$!
+
+# echo "Waiting at etcd server barrier on $host_name"
+# python3 $WS_PATH/sync.py barrier \
+#     --node-ips ${IPADDRS} \
+#     --node-ports 2379 \
+#     --wait-for-all-ports \
+#     --timeout 300
+
+# echo "All etcd servers are up : $host_name"
+# sleep 3
+
+# echo "etcd endpoint health=================="
+# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true
+# echo "======================================"
+
+# python3 $WS_PATH/sync.py barrier \
+#     --node-ips ${IPADDRS} \
+#     --node-ports 2379 \
+#     --wait-for-all-ports \
+#     --timeout 300
 
 # =============================================================================
 # Cluster Topology Configuration
@@ -343,7 +343,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         echo "DRY RUN: $HEALTH_BARRIER_CMD"
     else
         eval "$HEALTH_BARRIER_CMD"
-        echo "MoRI-IO proxy is ready for benchmarking"
+        echo "${ROUTER_TYPE} is ready for benchmarking"
     fi
 
     echo "Ready for benchmarking on ${host_name}:${host_ip}"
@@ -490,9 +490,9 @@ else
     [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true
 fi
 
-echo "Killing the etcd server"
-kill $etcd_pid 2>/dev/null || true
-pkill -f etcd 2>/dev/null || true
+# echo "Killing the etcd server"
+# kill $etcd_pid 2>/dev/null || true
+# pkill -f etcd 2>/dev/null || true
 
 echo "Script completed successfully"
 exit 0
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
index 8c7a9f07a..589399f74 100644
--- a/benchmarks/multi_node/amd_utils/setup_deps.sh
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -875,11 +875,11 @@ except Exception as e:
 # Run installers
 # =============================================================================
 
-install_ucx
-install_rixl
-install_etcd
-install_libionic
-install_mori
+# install_ucx
+# install_rixl
+# install_etcd
+# install_libionic
+# install_mori
 install_amd_quark
 install_mori_proxy_deps
 patch_mori_fp8_compat
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index 0b1c2b2f6..115e31a15 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -100,7 +100,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
 export PROFILER_ARGS=$profiler_args
 
 # Engine-specific xP/yD semantics and TP exports
-if [[ "$ENGINE" == "vllm" ]]; then
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
     export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
     export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
 fi