diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..3f905d3c8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7714,3 +7714,96 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + # _arm64 variant: GH runner pod doing `enroot import` is amd64, but + # gb300-cw compute nodes are aarch64 (Grace). Without the explicit + # arm64 tag the registry serves the amd64 manifest, which fails to + # exec on the compute side. + image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + # Five disagg topologies from NVIDIA/srt-slurm PR #85 branch + # recipes/dsv4-agg-disagg, overlaid with cw-specific fields by + # launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave (2x 18-node + # racks); recipes set their own sbatch_directives.segment for rack + # pinning. All use NIXL KV transfer. 
+ seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP=4 MXFP4 — low-latency baseline (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-tp4-mxfp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # 1P1D DEP4 mega_moe — TEP disagg (2 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-dep4-mega-moe.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + # 1P2D asymmetric DEP4->DEP8 mega_moe — best per-GPU efficiency (3 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p2d-dep4-to-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D symmetric DEP8 mega_moe — largest throughput (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-dep8-mega-moe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2P2D TP=8 MXFP4 — TP-only 4-node baseline (4 nodes) + - conc-list: [4, 8, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-tp8-mxfp4.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ 
b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml new file mode 100644 index 000000000..72baef909 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep4-mega-moe.yaml @@ -0,0 +1,128 @@ +name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" 
+ NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml new 
file mode 100644 index 000000000..a0b60a00b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-tp4-mxfp4.yaml @@ -0,0 +1,86 @@ +name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tensor-parallel-size: 4 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + use_chat_template: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml new file mode 100644 index 000000000..569373509 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p2d-dep4-to-dep8-mega-moe.yaml @@ -0,0 +1,127 @@ +name: "dsv4-pro-gb300-disagg-1p2d-dep4-to-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "3" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + 
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml new file mode 100644 index 000000000..8d82d58cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-dep8-mega-moe.yaml @@ -0,0 +1,126 @@ +name: 
"dsv4-pro-gb300-disagg-2p2d-dep8-mega-moe-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + 
trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + data-parallel-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.83 + max-running-requests: 2048 + cuda-graph-max-bs: 2048 + chunked-prefill-size: 32768 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml new file mode 100644 index 000000000..1b697d826 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p2d-tp8-mxfp4.yaml @@ -0,0 +1,98 @@ +name: "dsv4-pro-gb300-disagg-2p2d-tp8-mxfp4-1k1k" + +dynamo: + install: false + +setup_script: gb300-cw-sglang-container-deps.sh + +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + +sbatch_directives: + segment: "4" + mem: "0" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" 
+ +resources: + gpu_type: "gb300" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 8192 + disable-radix-cache: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "4x8x16x32x64x128x256x512" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a29c278f2..0b3b0b04f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1877,7 +1877,7 @@ - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - + - config-keys: - dsv4-fp4-b200-sglang description: @@ 
-1918,3 +1918,12 @@ - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)" + - "Topologies: five disagg layouts (1P1D TP4, 1P1D DEP4, 1P2D DEP4->DEP8, 2P2D DEP8, 2P2D TP8); MXFP4 / mega-MoE kernels, NIXL KV transfer" + - "Recipes ported from NVIDIA/srt-slurm PR #85 branch recipes/dsv4-agg-disagg (recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169 diff --git a/runners/gb300-cw-sglang-container-deps.sh b/runners/gb300-cw-sglang-container-deps.sh new file mode 100755 index 000000000..348c436ef --- /dev/null +++ b/runners/gb300-cw-sglang-container-deps.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Custom container-deps installer for gb300-cw + sglang. pip-installs +# dynamo from a wheel + source archive that launch_gb300-cw.sh pre-built +# on /mnt/vast BEFORE submitting sbatch. +# +# Why the prebuild design (mirrors the vllm sibling at +# gb300-cw-vllm-container-deps.sh from PR #1150): +# srt-slurm's per-rank install path runs `maturin build` inside every +# container srtctl srun's. The lmsysorg/sglang:deepseek-v4-grace- +# blackwell_arm64 image lacks rust pre-installed, so the per-rank +# build path can't run; pinning a published dev wheel (1.2.0.dev*) +# trips API drift against the bundled sglang 0.5.9 (compat shim +# warning + disagg startup warmup hang — see runs ending 2026-04-27). +# Building dynamo ONCE from hash 6a159fed (the same commit the gb200 +# vllm recipe pins, known to be sglang-API-stable) on a single-node +# srun in launch_gb300-cw.sh sidesteps both: every rank pip-installs +# from the cache here (~30 s, no contention). 
+# +# Used in tandem with `dynamo.install: false` in the gb300-cw sglang +# recipes so srt-slurm's hardcoded install path is skipped and this +# script is the sole installer. + +set -e + +DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" +CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" +DONE_MARKER="$CACHE_DIR/.done" + +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 + echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2 + exit 1 +fi + +echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" +pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall + +rm -rf /tmp/dynamo_build +mkdir -p /tmp/dynamo_build/dynamo +tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo +cd /tmp/dynamo_build/dynamo +pip install --break-system-packages -e . + +echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# --- NIXL DSv4 state-buffer patch: sglang PR #23773 -------------------------- +# The disagg recipes use NIXL KV transfer. Without this patch, NIXL +# silently drops auxiliary state buffers (SWA / NSA / Mamba), causing +# decode-side accuracy to collapse on DSv4-Pro. The patch mirrors what +# the Mooncake backend already does. See NVIDIA/srt-slurm PR #85 README. 
+SGLANG_DIR="${SGLANG_DIR:-/sgl-workspace/sglang}" +SGLANG_REMOTE="https://github.com/sgl-project/sglang.git" +SGLANG_PR_NUMBER="23773" +SGLANG_PR_REF="refs/pull/${SGLANG_PR_NUMBER}/head" +SGLANG_LOCAL_BRANCH="nixl-dsv4-pr-${SGLANG_PR_NUMBER}" + +echo "=== Installing SGLang NIXL DSV4 fix from PR #${SGLANG_PR_NUMBER} ===" + +if [ -d "$SGLANG_DIR/.git" ]; then + cd "$SGLANG_DIR" + git config --global --add safe.directory "$SGLANG_DIR" 2>/dev/null || true + if git remote get-url origin >/dev/null 2>&1; then + git remote set-url origin "$SGLANG_REMOTE" + else + git remote add origin "$SGLANG_REMOTE" + fi + git fetch --depth 1 origin "$SGLANG_PR_REF" + git checkout -f -B "$SGLANG_LOCAL_BRANCH" FETCH_HEAD + echo "Checked out SGLang PR #${SGLANG_PR_NUMBER} at $(git rev-parse HEAD)" +else + echo "WARNING: $SGLANG_DIR/.git not found; skipping NIXL patch (container may already include fix)" +fi + +# --- API-drift patch: dynamo 1.1.0 vs sglang 0.5.9 -------------------------- +# ai-dynamo at hash 6a159fed (1.1.0-equivalent) calls +# `engine.async_generate(return_routed_experts=...)`, but the sglang 0.5.9 +# bundled in lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 has an +# Engine.async_generate signature that doesn't accept that kwarg, so every +# request 500s with: +# TypeError: Engine.async_generate() got an unexpected keyword argument +# 'return_routed_experts' +# (See run 24973148979 → mooncake unblocked the disagg warmup; this is the +# next failure layer.) Strip the kwarg from every call site in the +# extracted dynamo source. `pip install -e .` above is editable, so the +# patch propagates immediately at next `python3 -m dynamo.sglang ...`. +DYNAMO_SRC=/tmp/dynamo_build/dynamo +patch_targets=$(grep -rl 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || true) +if [ -n "$patch_targets" ]; then + # Match WHOLE LINES that are just a kwarg pass: + # return_routed_experts=,? 
+ # The value is constrained to a simple identifier ([A-Za-z_][\w.]*), + # which deliberately excludes function calls (no `(` allowed). This + # leaves the multi-line assignment statement at decode_handler.py:275 + # intact: + # return_routed_experts = getattr( + # self.config.server_args, "enable_return_routed_experts", False + # ) + # That assignment is dead code after we strip the kwarg passes, but + # leaving it costs nothing and avoids the syntax-error trap from the + # earlier (over-greedy) version of this patch. + for f in $patch_targets; do + echo "[dynamo-patch] stripping return_routed_experts kwarg lines in $f" + python3 - "$f" <<'PYEOF' +import re, sys +path = sys.argv[1] +with open(path) as fh: + src = fh.read() +# Whole-line kwarg pass: indented `return_routed_experts=,?` then EOL. +# `[A-Za-z_][\w.]*` matches identifiers, attribute access, True/False/None — but NOT calls. +new = re.sub( + r'^[ \t]+return_routed_experts\s*=\s*[A-Za-z_][\w.]*\s*,?[ \t]*\n', + '', + src, + flags=re.MULTILINE, +) +if new != src: + with open(path, 'w') as fh: + fh.write(new) + print(f'[dynamo-patch] patched: {path}') +else: + print(f'[dynamo-patch] no kwarg-pass lines matched in: {path}') +PYEOF + done + # Sanity: any remaining occurrence is fine if it's the assignment; + # log it so the next person knows what's left. + echo "[dynamo-patch] residual occurrences (expected: only the dead assignment in decode_handler.py):" + grep -rn 'return_routed_experts' "$DYNAMO_SRC" --include='*.py' 2>/dev/null || echo " (none)" +else + echo "[dynamo-patch] no occurrences of return_routed_experts found in $DYNAMO_SRC (already patched or moved upstream)" +fi diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..edbb55375 --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,360 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. 
Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). Like +# that script, this one prebuilds the dynamo wheel once per hash and +# overlays gb300-cw-sglang-container-deps.sh as the per-rank installer, +# because the recipes set `dynamo.install: false` (see prebuild below). + +set -x + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. SRT_SLURM_MODEL_PREFIX matches the model.path alias in + # benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/. + export MODEL_PATH="/mnt/vast/models/dsv4/" + export SRT_SLURM_MODEL_PREFIX="dsv4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. `benchmark` +# (inherited from gb200-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. 
+SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting +# the main sbatch. The lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 +# image lacks a working ai-dynamo (install: false → ModuleNotFoundError), +# and pinning a published dev wheel (1.2.0.dev*) trips API drift against +# the bundled sglang 0.5.9 (compat shim warns then disagg startup warmup +# hangs — see runs ending 2026-04-27). Building from hash 6a159fed (the +# same commit the gb200 vllm sibling pins, known sglang-API-stable) on +# a single dedicated srun eliminates per-rank coordination on /mnt/vast +# (NFS flock is unreliable). Same pattern as PR #1150's vllm launcher. +DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" +DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" +DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" +DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" +mkdir -p "$DYNAMO_CACHE_ROOT" + +if [ ! -f "$DYNAMO_DONE_MARKER" ]; then + echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." + # Build into a unique temp dir, then atomically mv into place. Two + # concurrent runners may both build; the first to finish the rename + # wins, the loser cleans up. Same-directory rename() is atomic on + # NFS (unlike flock). + TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + # --mem=0: claim full node memory. Default cgroup is much smaller and + # rustc's link phase can OOM otherwise. CARGO_BUILD_JOBS=8 caps + # parallelism so peak rustc memory stays bounded on a 72-core Grace + # node, and `-C debuginfo=0` cuts per-process memory further. 
+ srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ + --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ + --job-name="${RUNNER_NAME}-prebuild" \ + --container-image="$SQUASH_FILE" \ + --no-container-entrypoint --no-container-mount-home \ + --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ + bash -c " + set -e + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + . \$HOME/.cargo/env + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout $DYNAMO_HASH + cd lib/bindings/python/ + export CARGO_BUILD_JOBS=8 + export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' + maturin build -o '$TEMP_BUILD' + cd /tmp/dynamo_build/dynamo + tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ + --exclude='lib/bindings/python/target' \ + --exclude='.git' \ + . + touch '$TEMP_BUILD/.done' + " + if [ -f "$TEMP_BUILD/.done" ]; then + # Atomic publish. If another runner already published, mv fails + # and we just discard our copy. + if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then + echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" + else + echo "[dynamo-prebuild] another runner published first, discarding our copy" + rm -rf "$TEMP_BUILD" + fi + else + echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 + rm -rf "$TEMP_BUILD" + exit 1 + fi +else + echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" +fi + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. 
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
+# Always start from a pristine clone; a checkout left over from a prior
+# run on this runner would mask the recipe overlay below.
+echo "Cloning srt-slurm repository..."
+SRT_REPO_DIR="srt-slurm"
+if [ -d "$SRT_REPO_DIR" ]; then
+    echo "Removing existing $SRT_REPO_DIR..."
+    rm -rf "$SRT_REPO_DIR"
+fi
+
+git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+cd "$SRT_REPO_DIR"
+# NOTE(review): assumes `recipes/dsv4-agg-disagg` is still a live ref on
+# origin (it tracks PR #85) — confirm it hasn't been merged or renamed,
+# since a failed checkout here would leave us on the default branch.
+git checkout recipes/dsv4-agg-disagg
+
+# Overlay our cw-adapted DSv4 SGLang disagg recipes onto the upstream
+# recipes from PR #85. The upstream recipes at
+# recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/ don't carry
+# cw-specific fields (dynamo.install, setup_script, extra_mount,
+# sbatch_directives), so we overlay locally-maintained copies that add
+# those. `cp -rT` replaces the upstream files in place.
+mkdir -p recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k" recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp
+
+# Drop our cache-installer setup_script next to upstream's configs.
+# Recipes reference it via `setup_script: gb300-cw-sglang-container-deps.sh`
+# alongside `dynamo.install: false` so srtctl skips its own pip install
+# and this script (force-reinstalling from /mnt/vast/dynamo_cache) is the
+# sole installer per rank.
+cp "$GITHUB_WORKSPACE/runners/gb300-cw-sglang-container-deps.sh" configs/gb300-cw-sglang-container-deps.sh
+chmod +x configs/gb300-cw-sglang-container-deps.sh
+
+echo "Installing srtctl..."
+# CRITICAL — uv install location.
+# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
+# shared NFS across both. srtctl's slurm template (job_script_minimal.j2)
+# does `if ! 
command -v uv` and skips its own ARM64 install when uv is
+# already on PATH; on compute nodes $HOME/.local/bin is on PATH by
+# default, so a stray x86 binary at $HOME/.local/bin/uv from this
+# runner shadows the template's install and crashes the orchestrator
+# with `cannot execute binary file: Exec format error`. Install to a
+# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86
+# uv left in the shared path by prior runs.
+rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx"
+export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin"
+mkdir -p "$XDG_BIN_HOME"
+curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh
+export PATH="$XDG_BIN_HOME:$PATH"
+
+# Two guards: (1) uv really landed in the pod-local bin dir; (2) nothing
+# leaked onto the shared NFS $HOME path where compute nodes would pick
+# up the wrong-arch binary.
+if [ ! -x "$XDG_BIN_HOME/uv" ]; then
+    echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2
+    exit 1
+fi
+if [ -e "$HOME/.local/bin/uv" ]; then
+    echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2
+    exit 1
+fi
+
+uv venv
+source .venv/bin/activate
+uv pip install -e .
+
+if ! command -v srtctl &> /dev/null; then
+    echo "Error: Failed to install srtctl"
+    exit 1
+fi
+
+echo "Configs available at: $SRT_REPO_DIR/"
+
+SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+echo "Creating srtslurm.yaml configuration..."
+# NOTE(review): the next line is truncated in this hunk — the heredoc
+# body that populates srtslurm.yaml AND the `srtctl apply ...` command
+# whose combined stdout/stderr is captured into SRTCTL_OUTPUT are
+# missing between `<` and `&1)`. Restore them from the original script
+# before applying this patch; as written this is a syntax error.
+cat > srtslurm.yaml <&1)
+echo "$SRTCTL_OUTPUT"
+
+# Parse the submitted Slurm job id: prefer the decorated "✅ Job NNN"
+# line, fall back to any plain "Job NNN" if the emoji was stripped.
+JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')
+
+set +x
+
+if [ -z "$JOB_ID" ]; then
+    echo "Error: Failed to extract JOB_ID from srtctl output"
+    exit 1
+fi
+
+echo "Extracted JOB_ID: $JOB_ID"
+
+LOGS_DIR="outputs/$JOB_ID/logs"
+LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
+
+# Wait for the job to start writing its sweep log; bail out if the job
+# leaves the squeue before the log file ever appears.
+while ! ls "$LOG_FILE" &>/dev/null; do
+    if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+        echo "ERROR: Job $JOB_ID failed before creating log file"
+        scontrol show job "$JOB_ID"
+        exit 1
+    fi
+    echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
+    sleep 5
+done
+
+# Background poller that exits once the job leaves the queue; its PID is
+# handed to `tail --pid` below so the tail terminates with the job.
+(
+    while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+        sleep 10
+    done
+) &
+POLL_PID=$!
+
+echo "Tailing LOG_FILE: $LOG_FILE"
+
+# -F follows across rotation/recreation; --pid stops when the poller dies.
+tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+wait $POLL_PID
+
+set -x
+
+echo "Job $JOB_ID completed!"
+echo "Collecting results..."
+
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
+    echo "Warning: Logs directory not found at $LOGS_DIR"
+fi
+
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
+
+    # One subdirectory per benchmarked (isl, osl) sweep configuration.
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    filename=$(basename "$result_file")
+                    # Pull concurrency/gpus/ctx/gen out of the filename,
+                    # results_concurrency_<c>_gpus_<g>_ctx_<i>_gen_<o>.json
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    # NOTE(review): RESULT_FILENAME is defined outside this
+                    # hunk — presumably exported by the workflow env; confirm.
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
+
+    echo "All result files processed"
+else
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
+
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        # nullglob: an empty dir yields zero iterations, not a literal '*'.
+        shopt -s nullglob
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue
+            cp "$eval_file" "$GITHUB_WORKSPACE/"
+            echo "Copied eval artifact: $(basename "$eval_file")"
+        done
+        shopt -u nullglob
+    else
+        echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR"
+    fi
+fi