diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..74e9eb914 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7714,3 +7714,96 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + # Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA + # 228febcfe9c76347cd619a7622af83ca52ca35a4. 8k/1k only — PR 84 + # publishes 5 recipes spanning low-conc (TP=4 decode) → mid (DP=4/8 + # decode + DP=4 prefill workers) → max (14p1d-dep4-dep16, 18 nodes). + # Each recipe rack-pins via its own sbatch_directives.segment. + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # Low-conc / interactivity: 1 prefill (DP=4 + EP) + 1 decode (TP=4). + # 2 nodes total. Decode is plain TP, no EP/DP. + - conc-list: [4, 8, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Mid-low: 1 prefill (DP=4) + 1 decode (DP=4 + EP). 2 nodes total. + # Decode swings to DP+EP at conc 256/512 to spread the MoE experts. + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + # Mid-high: 6 prefills (DP=4 each) + 1 decode (DP=8 + EP). 10 nodes + # per upstream resources block (decode_nodes:4 verbatim from PR 84). + - conc-list: [1024, 2048] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # High: 12 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 16 nodes. + - conc-list: [3072, 4096] + prefill: + num-worker: 12 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max: 14 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 18 nodes + # — fills exactly one cw rack. 
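+      # GPU math: 14 prefill workers x 4 GPUs + one DP=16 decode worker +      # = 72 GPUs, i.e. 18 nodes at the cluster's 4 GPUs/node.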
+ - conc-list: [6144, 8192] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml new file mode 100644 index 000000000..4e392d943 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-12p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. High 8k/1k: +# 12 prefills (DP=4 each) + 1 wide decode (DP=16). 16 nodes total. +# Fits within one cw rack (18 nodes). + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "16" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 12 + decode_nodes: 4 + prefill_workers: 12 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + 
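+  # Decode sizing: DP=16 ranks x max-num-seqs=512 = 8192 concurrent +  # decode slots — headroom over the c3072/c4096 sweep this recipe runs.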
decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml new file mode 100644 index 000000000..964730f79 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-14p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Max 8k/1k: +# 14 prefills (DP=4 each) + 1 wide decode (DP=16). 18 nodes total — +# fills exactly one cw rack. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "18" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 14 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + 
no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6144x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml new file mode 100644 index 000000000..3b30212ad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-dep4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (DP=4 on 1 node). 2 nodes total. +# Decode shifts from TP=4 (low conc) to DP=4+EP at conc 256/512 to spread +# the MoE experts across more ranks.
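+# Sizing note: decode runs max-num-seqs=128 per DP rank, so DP=4 gives +# 4 x 128 = 512 concurrent decode slots — exactly the c512 ceiling this +# recipe sweeps.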
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml new file mode 100644 index 000000000..bd5f303ba --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-tp4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Low-concurrency 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (TP=4 on 1 node). 2 nodes total. +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks); pinned to one rack +# via sbatch_directives.segment because cw's srtslurm.yaml turns off +# srtctl's auto-segment. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + # Use full node memory; cw default cgroup is too tight for DSV4 weight load. + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + 
enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64x128x256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml new file mode 100644 index 000000000..b3e9cb523 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid-high 8k/1k: +# 6 prefills (DP=4 each, 1 node each) + 1 wide decode (DP=8). 10 nodes +# total per upstream resources block (decode_nodes:4 even though one +# DP=8 worker only needs 2 nodes — preserved verbatim from upstream). + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "10" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + 
data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0421c5596..2bf844101 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1928,3 +1928,13 @@ - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB300 sweep on cluster gb300-cw (CoreWeave; 2x 18-node racks)" + - "Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA 228febcf. 5 recipes spanning 8k/1k from c=4 to c=8192: 1p1d-dep4-tp4 (low conc), 1p1d-dep4-dep4 (c512), 6p1d-dep4-dep8 (c2048), 12p1d-dep4-dep16 (c4096), 14p1d-dep4-dep16 (c8192, 18 nodes)" + - "Container pinned to vllm/vllm-openai@sha256:d29a90b1... (cu130 + DSV4). Dynamo via published v1.0.2 wheel (install: true). Per-worker tuning: numa-bind, safetensors-load-strategy: prefetch, weight offload (group-size 3), enable-ep-weight-filter, enable-sleep-mode, all2all-backend: flashinfer_nvlink_one_sided on decode, PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True on prefill" + - "vLLM patches (auto-applied by upstream configs/vllm-container-deps.sh): cumem expandable_segments fix, MegaMoE free-orig (vllm-project/vllm#40860 backport), nvlink one-sided bf16 fix, numa-bind hash fix" + - "New runners group gb300-cw (gb300-cw_0 through gb300-cw_3) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each recipe rack-pins via sbatch_directives.segment (cw's srtslurm.yaml turns off srtctl auto-segment)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..569cc28ac --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,305 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cw's filesystem +# layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, +# and the SLURM partition is `all`. cw is 2x 18-node racks; srtctl's +# auto-segment is disabled (use_segment_sbatch_directive: false) and each +# recipe pins its own segment via sbatch_directives — the largest +# topology (14p1d-dep4-dep16, 18 nodes) fills exactly one rack. +# +# srt-slurm is checked out at NVIDIA/srt-slurm PR #84 head; that PR ships +# the dynamo 1.0.2 install path + the vLLM patches the new recipes +# require, so we use upstream's configs/vllm-container-deps.sh and +# configs/patches/* unchanged (no local overlay).
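+# +# Illustrative invocation (these env vars are normally injected by the +# CI runner from .github/configs/nvidia-master.yaml): +#   FRAMEWORK=dynamo-vllm MODEL_PREFIX=dsv4 PRECISION=fp4 ISL=8192 OSL=1024 \ +#   CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml \ +#   ./runners/launch_gb300-cw.sh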
+ +set -x + +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local NVMe on cw. + export MODEL_PATH="/mnt/vast/models/dsv4/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; no separate batch queue. +# Account `cw-sup` is what `sacctmgr show assoc user=$USER` returns on this +# cluster — `benchmark` (inherited from gb200-nv) does not exist here. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env to +# decide which host driver libraries (libcuda.so.1, libnvidia-*.so) to +# mount into the container. cw doesn't set them by default — without them +# the container has no libcuda and `import vllm._C` dies with +# "libcuda.so.1: cannot open shared object file". SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# The deepseekv4-cu130 vLLM image is pre-staged at /mnt/vast/squash_dupe/ +# (manual upload — enroot import of the ~25 GB image takes too long to +# repeat each run). nginx is small enough to import on-demand into +# /mnt/vast/squash/. +SQUASH_DIR="/mnt/vast/squash" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="/mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +if [ ! -f "$SQUASH_FILE" ]; then + echo "ERROR: pre-staged vLLM squash not found at $SQUASH_FILE" >&2 + echo "Re-stage it from docker://$IMAGE or repoint SQUASH_FILE." >&2 + exit 1 +fi +enroot import -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# The srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +# Pin to NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) head SHA. PR 84 +# carries the configs/patches/* (cumem expandable_segments fix, MegaMoE +# free_orig, nvlink one-sided bf16 fix, numa-bind hash fix) and the +# matching configs/vllm-container-deps.sh that wires them up. Released +# dynamo 1.0.2 wheel + sleep-mode + safetensors prefetch make the +# prebuild infrastructure unnecessary, so we use upstream's setup +# script directly — no overlay.
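+# (The fetch below materializes the PR head as local branch pr-84; the +# checkout then detaches at the pinned SHA, so the run builds the pinned +# commit rather than whatever the PR head currently points at.)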
git fetch origin pull/84/head:pr-84 +git checkout 228febcfe9c76347cd619a7622af83ca52ca35a4 +# Use `cp -rT` so if the upstream branch ever ships a stub +# `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it +# rather than nesting (`cp -r src dst` would create +# `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). +mkdir -p recipes/vllm/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared +# NFS across both. srtctl's slurm template (job_script_minimal.j2) does +# `if ! command -v uv` and skips its own ARM64 install when uv is already +# on PATH; on compute nodes $HOME/.local/bin is on PATH by default, so a +# stray x86 binary at $HOME/.local/bin/uv from this runner shadows the +# template's install and crashes the orchestrator with +# `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +# Sanity: confirm the install landed where we expect, not in $HOME/.local/bin. +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +# Create srtslurm.yaml for srtctl. Keys and the `srtctl apply` invocation +# below follow the gb200-nv launcher this script mirrors; the cw-specific +# values are the `all` partition, the cw-sup account, the /mnt/vast +# staging paths, and use_segment_sbatch_directive: false (each recipe +# rack-pins itself via sbatch_directives.segment). +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml <<EOF +srt_slurm_root: "${SRTCTL_ROOT}" +partition: "${SLURM_PARTITION}" +account: "${SLURM_ACCOUNT}" +model_path: "${MODEL_PATH}" +squash_path: "${SQUASH_DIR}" +use_segment_sbatch_directive: false +EOF + +echo "Applying recipe config: $CONFIG_FILE" +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +# Wait for log file to appear (also check job is still alive) +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +# Poll for job completion in background +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +# Stream the log file until job completes (-F follows by name, polls instead of inotify for NFS) +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..."
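+# Layout the collection below expects under outputs/$JOB_ID/logs/ (per +# srtctl's sweep output; names illustrative): +#   sweep_${JOB_ID}.log — orchestrator log (tailed above) +#   <name>_isl8192_osl1024/results_concurrency_*.json — per-concurrency results +#   eval_results/ — present only when evals ran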
+ +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi + + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi