diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e4177ee8..58cdfe8b9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7606,7 +7606,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130 + image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -7615,102 +7615,94 @@ dsv4-fp4-gb200-dynamo-vllm: multinode: true disagg: true seq-len-configs: - # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's - # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg - # at this seq-len yet (PR #67 only publishes 8k/1k). - - isl: 1024 + - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch - # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - - conc-list: [1, 4, 8, 16, 32, 64] + # Six 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm + # aflowers/gb200-dsv4-recipes branch, recipes/vllm/deepseek-v4-pro-sa/ + # (the SemiAnalysis-curated subset of PR #77). conc-list values match + # each recipe's benchmark.concurrencies. + + # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). + # 18 nodes. Multiple TP-only decoders parallelize independent requests. + - conc-list: [1, 8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml" decode: - num-worker: 1 + num-worker: 8 tp: 8 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - - conc-list: [128, 256, 1024, 2048, 4096] + # 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes. + - conc-list: [64, 128, 256, 512, 1024] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # The 4096 overlap with the 1p1d block gives a crossover point. 8192 - # would saturate 1p1d's prefill, so this topology takes over there. - - conc-list: [4096, 8192] + # 1p4d pure-TP decode: 1 prefill (DEP=8) + 4 decode (TP=8). 10 nodes. + - conc-list: [256, 512] prefill: - num-worker: 3 + num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # 2p1d DEP-8 decode (c4096): 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. 
+ - conc-list: [4096] prefill: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] + ep: 8 + dp-attn: true + # 3p1d DEP-8 decode (c4096): 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. + - conc-list: [4096] prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [4096, 8192] + # 3p1d wide DEP-16 decode (c4096): 3 prefill (DEP=8) + 1 decode (DEP=16). 10 nodes. + - conc-list: [4096] prefill: - num-worker: 7 + num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml deleted file mode 100644 index bf5b441b9..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" - -# 1k/1k mid-to-high throughput topology. Extrapolated from -# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's -# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles -# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high -# enough at this prompt length; see kimi precedent). -# -# Differences from our 8k1k 7p1d-dep8-dep16: -# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes -# * max-model-len: 3072 instead of auto -# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq) -# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism) -# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so -# a slow first-time Lustre load + cudagraph capture can't get cut off by the -# SLURM wall clock. -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from -# Lustre with multiple workers contending for the same OSTs — previous 1k/1k -# run hit the default 1800s. Make this *very* generous since the cost of an -# over-long deadline is just sitting idle, not wasted compute. 
-health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x1024x2048x4096" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index 63e9e280c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those -# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) -# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s -# exceeds what one DP=8 worker can sustain. -# -# Decode capacity: -# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which -# leaves headroom over the conc=8192 working set (per-rank avg 512). -# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is -# ~512 so cudagraphs still apply at steady state. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 1024 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 1024 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml similarity index 58% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index 984c79526..ab6d27cb7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -1,24 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" -# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from -# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only -# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). 
No -# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet. +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml # -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64). +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes total. -sa +# variant extends concurrencies to 64x128x256x512x1024. # -# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see -# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list. - +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -28,7 +32,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -38,17 +41,13 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -57,30 +56,27 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -92,7 +88,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 3072 + max-model-len: 9280 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -103,42 +99,46 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. 
Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. + numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" - isl: 1024 + isl: 8192 osl: 1024 - concurrencies: "1x4x8x16x32x64" + concurrencies: "64x128x256x512x1024" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 000000000..3864fec47 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (pure TP=8). 10 nodes. +# Targets c256-c512 with TP-only decoders. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 000000000..b40f89d1c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 8 decode (pure TP=8). 18 nodes. +# Targets c8-c512 with parallel TP-only decoders. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + 
numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x8x16x32x64x128x256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml similarity index 54% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index 0c872e9c4..9848edb01 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -1,37 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" +name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" -# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch: -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml # -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TEP-style decode (TP-sharded -# attention + EP'd experts within one worker) gives the best per-user -# latency. +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. +# c4096-tuned variant (decode max-num-seqs=512). # # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# our launch script's SRT_SLURM_MODEL_PREFIX. -# * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026 -# which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM -# expert offload (offload-group-size/-num-in-group/-prefetch-step) is -# KEPT — it's load-bearing here, see the comment in vllm_config.prefill. -# * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode -# dropped. Both require PR #68 sa-bench tokenizer support that our -# pinned srtctl version doesn't have. The recipe-level -# `tokenizer-mode: deepseek_v4` for workers stays. 
-# * Container kept on the floating tag (`:deepseekv4-cu130`) instead of -# the upstream sha256 pin. -# * health_check / slurm.time_limit added — we observed cold-cache -# Lustre loads exceeding the default 1800s deadline. - +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -41,27 +32,22 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 4 decode_nodes: 2 - prefill_workers: 1 + prefill_workers: 2 decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,30 +56,27 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -105,7 +88,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 9280 + max-model-len: 16384 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -116,42 +99,46 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. 
+ numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1x4x8x16x32x64" + concurrencies: "4096" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml similarity index 50% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index d6b750bf2..3f3803d3b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -1,20 +1,28 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +# +# Topology: 3 prefill (DEP=8) + 1 wide decode (DEP=16). 10 nodes. +# c4096-tuned variant. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. 
+# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -24,7 +32,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -34,15 +41,12 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 16 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -50,7 +54,15 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -58,7 +70,13 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -70,17 +88,23 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -90,7 +114,7 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto + max-model-len: 16384 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 @@ -102,11 +126,19 @@ backend: stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + tokenizer-mode: deepseek_v4 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024" + concurrencies: "4096" req_rate: "inf" - use_chat_template: false + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 000000000..f3b09e0db --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,144 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +# +# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. +# c4096-tuned variant (decode max-num-seqs=512). +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b 
shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml deleted file mode 100644 index 6213373b3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ /dev/null @@ -1,122 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" - -# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra -# benchmark flag: use_chat_template=false. The HF tokenizer for -# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's -# --use-chat-template path calls tokenizer.apply_chat_template() and raises -# ValueError. Throughput benchmarking uses /v1/completions with random tokens -# anyway — no chat template needed. -# -# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a -# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ -# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and -# uses this native formatter — no custom Jinja template required. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads -# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor -# shards with 14 prefill workers contending for the same OSTs. The first -# bump to 7200s was still insufficient in one case, so pad generously to -# 14400s (4h). Over-long deadline only costs idle time, not compute. 
-health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8941211c1..b161e9b95 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1937,3 +1937,12 @@ - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/gb200-dsv4-recipes (PR #77, supersedes #71)" + - "8k/1k search-space expanded from 3 topologies to 6: adds 1p4d/1p8d pure-TP-decode (offload), 1p1d/2p1d/3p1d DEP-8 decode, and a 3p1d-dep8-dep16 wide-decode shape" + - "Drops local workarounds: numa-bind, benchmark.use_chat_template, and benchmark.custom_tokenizer are restored now that PR #77 ships vllm_numa_bind_hash_fix.py and sa-bench DSV4 tokenizer support" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 + diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..bbc9b22af 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,10 +43,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ 
$MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre - # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the - # model.path alias in our DSV4 recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 recipes. + export MODEL_PATH="/mnt/lustre01/models/deepseek-v4-pro" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" @@ -143,7 +141,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout aflowers/vllm-gb200-v0.20.0 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create