Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5477ee4
[AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X
chunfangamd Mar 11, 2026
326b31d
[AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support
chunfangamd Mar 11, 2026
7d9eb51
[AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API
chunfangamd Mar 11, 2026
02a547c
[AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster)
chunfangamd Mar 12, 2026
ab656e1
[AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0
chunfangamd Mar 12, 2026
6bb39b4
[AMD] Use public vLLM base image with runtime dependency install
chunfangamd Mar 12, 2026
b4dad14
[AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg d…
chunfangamd Mar 13, 2026
d2b9332
[AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware p…
chunfangamd Mar 13, 2026
159b571
[AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh
ichbinblau Mar 17, 2026
d7e9e50
Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but…
ichbinblau Mar 17, 2026
fe12e82
[AMD] Fix vLLM disagg hang: READ mode support + safety timeouts
chunfangamd Mar 19, 2026
3e05159
Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs)
chunfangamd Mar 21, 2026
4d1a315
[AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client ha…
chunfangamd Mar 22, 2026
0006d60
[AMD] Fix vLLM disagg Slurm job never terminating after benchmark com…
chunfangamd Mar 22, 2026
a002143
[AMD] Enable MoRI-IO READ mode by default for vLLM disagg
chunfangamd Mar 22, 2026
eba7f66
[AMD] Fix CI checkout failure caused by root-owned __pycache__ files
chunfangamd Mar 22, 2026
8c01e38
[AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS
chunfangamd Mar 23, 2026
7f03362
[AMD] Fix KV reaper deadlock on high-ISL disagg workloads
chunfangamd Mar 23, 2026
5fedd82
[AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,…
ichbinblau Mar 24, 2026
708570b
[AMD] Upgrade vLLM disagg image from v0.17.1 to v0.18.0
chunfangamd Mar 29, 2026
96154d2
[AMD] Add Kimi-K2.5-MXFP4 disagg inference config (1P2D)
chunfangamd Mar 30, 2026
25d1f59
feat: add MiniMax M2.5 PD disaggregation recipe (1P2D, MoRI-EP + MoRI…
chunfangamd Apr 3, 2026
ac24450
feat: add Dockerfile and runtime patch for MiniMax M2.5 WideEP + MoRI
chunfangamd Apr 3, 2026
e553a8c
Fix: rename minimaxm25 to minimaxm2.5 for CI naming consistency
chunfangamd Apr 3, 2026
ebaabd2
Optimize: add --gpu-memory-utilization 0.95 and --block-size 32 to Mi…
chunfangamd Apr 3, 2026
4d82c0f
Fix: MiniMax M2.5 disagg — require EP=8 for prefill, fix ROCm gate dtype
chunfangamd Apr 3, 2026
e163312
Remove unused docker/minimax-m25-disagg/ directory
chunfangamd Apr 3, 2026
185df53
remove vllm disagg for dpsr1 and dpv3
ichbinblau Apr 13, 2026
48cc23a
consolidate amd_utils for sglang and vllm
ichbinblau Apr 21, 2026
5adfe2b
use vLLM router as default router for vllm disagg
ichbinblau Apr 21, 2026
0734709
fix bugs
ichbinblau Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 107 additions & 1 deletion .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,6 @@ dsr1-fp8-mi355x-sglang-disagg:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp8-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -993,6 +992,113 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"

# Kimi-K2.5 MXFP4 on MI355X, vLLM prefill/decode disaggregation.
# NOTE(review): leading indentation was lost in this diff extraction; nesting
# below is reconstructed from the key semantics and the sibling
# *-sglang-disagg entries in this file — confirm against the original file.
kimik2.5-fp4-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:v0.18.0
  model: amd/Kimi-K2.5-MXFP4
  model-prefix: kimik2.5
  runner: mi355x-disagg
  precision: fp4
  framework: vllm-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

    - isl: 8192
      osl: 1024
      search-space:
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

# MiniMax M2.5 FP8 on MI355X, vLLM prefill/decode disaggregation.
# NOTE(review): leading indentation was lost in this diff extraction; nesting
# below is reconstructed from the key semantics and the sibling entries in
# this file — confirm against the original file.
minimaxm2.5-fp8-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:v0.18.0
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: mi355x-disagg
  precision: fp8
  framework: vllm-disagg
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
        # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
        # TP8 shards to 192 which is not divisible by FP8 block_n=128.
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

    - isl: 8192
      osl: 1024
      search-space:
        - spec-decoding: "none"
          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "PREFILL_NODES=1"
              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
          decode:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: false
            additional-settings:
              - "DECODE_NODES=2"

dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
Expand Down
72 changes: 54 additions & 18 deletions benchmarks/multi_node/amd_utils/bench.sh
Original file line number Diff line number Diff line change
@@ -1,63 +1,99 @@
#!/bin/bash
# Dual-Engine Disaggregated Benchmark Runner
#
# ENGINE=sglang-disagg (default): SGLang benchmark
# ENGINE=vllm / vllm-disagg:      vLLM benchmark
#
# Produces JSON result files via benchmark_serving.py so that the CI pipeline
# can collect and process results.
#
# Required env (optional): MODEL_PATH (overrides <model_dir>/<model_name>),
#   ROUTER_PORT (default 30000), IS_MTP ("true" enables chat template on sglang),
#   ENGINE (see above).
#
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
#                      <model_dir> <model_name> <log_path> <isl> <osl> \
#                      <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
#
# NOTE(review): this block was reconstructed from an interleaved old/new diff
# rendering; the post-change (new) side is kept throughout.

ENGINE="${ENGINE:-sglang-disagg}"

n_prefill=$1
n_decode=$2
prefill_gpus=$3
decode_gpus=$4
model_path=$5
model_name=$6
# Allow callers to override the fully-resolved model path via env.
MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
log_path=$7

chosen_isl=${8:-1024}
chosen_osl=${9:-1024}
concurrency_list=${10:-"512x1"}
# Default request rate differs by engine: vLLM benchmarks default to
# unthrottled ("inf"), SGLang to 1 req/s.
# NOTE(review): this tests ENGINE == "vllm" while the flag/cooldown logic
# below tests "vllm-disagg" — confirm both ENGINE spellings are intended.
if [[ "$ENGINE" == "vllm" ]]; then
  chosen_req_rate=${11:-inf}
else
  chosen_req_rate=${11:-1}
fi
random_range_ratio=${12:-0.8}
num_prompts_multiplier=${13:-10}

# Concurrency list arrives "a x b x c"-style, e.g. "64x128x256".
IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"

ROUTER_PORT="${ROUTER_PORT:-30000}"

echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"

profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
mkdir -p "$profile_folder"

source "$(dirname "$0")/../../benchmark_lib.sh"

# Repo root inside the container (3 levels up from this script's directory)
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"

for max_concurrency in "${chosen_concurrencies[@]}"; do

  export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"

  # Scale prompt count with concurrency, with a floor so tiny concurrency
  # values still produce a statistically useful sample.
  num_prompts=$(( max_concurrency * num_prompts_multiplier ))
  if [[ "$num_prompts" -lt 16 ]]; then
    num_prompts=16
  fi

  echo "profile_folder: $profile_folder"
  echo "max_concurrency: $max_concurrency"
  echo "chosen_req_rate: $chosen_req_rate"
  echo "MODEL_PATH: $MODEL_PATH"
  echo "ROUTER_PORT: $ROUTER_PORT"
  echo "chosen_isl: $chosen_isl"
  echo "chosen_osl: $chosen_osl"
  echo "num_prompts: $num_prompts"
  echo "export_file: $export_file"

  # Engine-specific extra flags
  extra_flags=""
  if [[ "$ENGINE" == "vllm-disagg" ]]; then
    extra_flags="--trust-remote-code"
  else
    if [ "$IS_MTP" = "true" ]; then
      extra_flags="--use-chat-template"
    fi
  fi

  run_benchmark_serving \
    --bench-serving-dir "$REPO_ROOT" \
    --model "$MODEL_PATH" \
    --port "$ROUTER_PORT" \
    --backend openai \
    --input-len "$chosen_isl" \
    --output-len "$chosen_osl" \
    --random-range-ratio "$random_range_ratio" \
    --num-prompts "$num_prompts" \
    --max-concurrency "$max_concurrency" \
    --result-filename "$export_file" \
    --result-dir /workspace/ \
    $extra_flags

  echo "-----------------------------------------"

  # vLLM: cooldown between rounds for idle KV block reaper
  if [[ "$ENGINE" == "vllm-disagg" ]]; then
    echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
    sleep 10
  fi
done
Loading