diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..de0db09d5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2461,6 +2461,61 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro H100 disaggregated multinode via dynamo-vllm. +# 2 prefill nodes + 2 decode nodes = 32 H100s total (fills h100-multinode pool). +# Cross-node TP=16 over IB: DSV4-Pro per-rank weight footprint at DP=16/EP=16 +# is 74.99 GiB on H100 80GB (run 24923521075 OOM'd in sparse_attn_indexer +# profile_run with only ~4 GiB headroom). TP=16 shards the model 16-way +# across the 2 nodes, dropping per-rank weights to ~5 GiB. Recipe bundled +# locally at benchmarks/multi_node/srt_slurm_recipes/ until upstreamed. +dsv4-fp8-h100-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h100-multinode + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + # 1P+1D TEP=16 across the full 4-node h100-multinode pool. + # Each prefill/decode worker spans 2 nodes via Dynamo's --headless + # secondary-node mode + vLLM's MultiprocExecutor + torch.distributed PG. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + additional-settings: + # benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: false + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..049fa8218 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,183 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) +# +# DSV4-Pro FP8 weights are 74.99 GiB per DP rank — DP=16/EP=16 (one process +# per GPU) leaves only ~4 GiB headroom on H100 80GB and sparse_attn_indexer +# OOMs during profile_run (run 24923521075 hit this on every prefill and +# decode worker simultaneously). Switching to cross-node TP=16 shards +# dense+expert weights 16-way → ~5 GiB/GPU, plenty of headroom. 
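A back-of-envelope check of the sharding figures quoted above; the 74.99 GiB footprint and TP=16 are the recipe's own numbers, and the snippet is standalone, not part of any recipe or launcher:

  # Quick sanity check of the per-rank weight math.
  awk 'BEGIN {
    full_fp8_weights_gib = 74.99   # per-DP-rank footprint quoted above
    tp                   = 16      # cross-node tensor-parallel degree
    printf "per-rank weight shard at TP=16: %.2f GiB\n", full_fp8_weights_gib / tp
  }'
  # -> per-rank weight shard at TP=16: 4.69 GiB (the "~5 GiB" figure above)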
+# +# Layout: +# - 1 prefill endpoint × TP=16 across 2 nodes (16 GPUs) +# - 1 decode endpoint × TP=16 across 2 nodes (16 GPUs) +# - Total: 32 GPUs across 4 nodes (fills h100-multinode pool) +# +# How the cross-node launch works: +# - srt-slurm's standard TP mode (no data-parallel-size) starts one srun +# per node, each process gets all 8 local GPUs. +# - Leader (node_rank=0): full Dynamo + vLLM with `--master-addr +# --nnodes 2 --node-rank 0 --tensor-parallel-size 16`. MultiprocExecutor +# spawns 8 local workers, then waits for the 8 remote workers. +# - Secondary (node_rank=1): adds `--headless` (srt-slurm vllm.py:386-388), +# which routes through dynamo's run_dynamo_headless → vLLM's run_headless → +# MultiprocExecutor(monitor_workers=False) joining the leader's PG over +# torch.distributed (master_addr / master_port). NCCL backs the TP +# all-reduce; on h100-multinode that flows over IB (no NVLink between +# nodes), so per-layer TP comms is the dominant latency cost — accepted +# as the only way to fit DSV4-Pro on 80 GB H100. + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +# Pin ai-dynamo to a commit whose vllm.inputs imports match the DSV4 vLLM +# wheel AND that exposes --headless via run_dynamo_headless (see +# components/src/dynamo/vllm/main.py:80). Requires the alec-flowers/srt-slurm +# fork (NVIDIA/srt-slurm#71), which extends the dynamo config to accept +# `hash` as well as `version`. +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +# Bumped from the srt-slurm 6h default. Cross-node TP=16 over IB plus +# cudagraph capture extends post-load init noticeably; cold-cache Lustre +# weight load alone took 24 min in run 24923521075. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s (180 attempts) default to 4 hours. Run 24923521075 +# observed full weight load taking ~1450s with three concurrent matrix +# jobs starving the same Lustre OSTs. Cross-node TP setup + indexer +# warm-up adds more on top. Match the GB200 dsv4 recipes; the cost of an +# over-long deadline is sitting idle, not wasted compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + # VLLM_USE_NCCL_SYMM_MEM routes expert-parallel all2all through NCCL + # symmetric memory instead of NVSHMEM IPC sockets. The DSV4 cu129 vLLM + # wheel's NVSHMEM fails IPC bootstrap on our H100 nodes (mem_heap.cpp + # "Fatal IPC Failure" right after weight load). NCCL_CUMEM_ENABLE is + # the companion flag. + # + # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True reduces fragmentation + # in the tight headroom — even with TP=16 sharding the model 16-way, a + # contiguous 512 MiB indexer scratch can still fail under fragmented + # 80 GB caches. The OOM error message itself recommends this exact flag. 
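The leader/secondary flow described in the header comment above, sketched as the two per-node command lines. The real template lives in srt-slurm's vllm.py; the "python -m dynamo.vllm" entrypoint spelling and the hostname below are assumptions, and only the flags themselves come from that description:

  MASTER_ADDR="prefill-node-0"   # placeholder leader hostname

  # node_rank=0 (leader): full Dynamo + vLLM. MultiprocExecutor spawns the
  # 8 local TP workers, then waits for the 8 remote ranks to join the PG.
  python -m dynamo.vllm \
    --master-addr "$MASTER_ADDR" --nnodes 2 --node-rank 0 \
    --tensor-parallel-size 16

  # node_rank=1 (secondary): --headless routes through run_dynamo_headless ->
  # vLLM's run_headless -> MultiprocExecutor(monitor_workers=False), joining
  # the leader's torch.distributed process group over master_addr/master_port.
  python -m dynamo.vllm \
    --master-addr "$MASTER_ADDR" --nnodes 2 --node-rank 1 \
    --tensor-parallel-size 16 --headless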
+ prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + # tensor-parallel-size: 16 across 2 nodes (each srt-slurm srun process + # gets 8 local GPUs; vLLM forms a 16-rank PG via the master_addr + # handshake). data-parallel-size is intentionally absent — its presence + # would push srt-slurm into the per-GPU process layout, which is what + # broke the previous TEP=16 attempt with "World size (16) > available + # GPUs (1) in this node". + # + # enable-expert-parallel keeps experts sharded EP=16 along the same + # 16 ranks (each rank holds 1/16 of the routed experts). Communication + # is dominated by per-layer TP all-reduce + EP all-to-all, both over + # IB. + # + # enforce-eager on both sides for the first cross-node attempt: cudagraph + # capture across nodes is fragile (FULL_DECODE_ONLY graphs include the + # cross-node TP all-reduce) and the previous run's 1.48 GiB private-pool + # accumulation already burned the headroom. Drop graphs to ship; revisit + # for decode performance once the server is observed healthy. + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + # With TP=16 the per-rank model footprint drops from ~75 GiB to + # ~5 GiB, so we can match the H200 single-node 0.95 utilization. + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + # DSV4-Pro's HF tokenizer ships with no chat_template attribute. The + # server uses --tokenizer-mode deepseek_v4 to handle templating itself, + # but sa-bench's local apply_chat_template path raises ValueError. + # Send raw prompts; the server handles formatting. 
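Rough per-GPU budget implied by the gpu-memory-utilization comment above, using the recipe's own ~5 GiB shard estimate, the 0.95 setting, and an assumed ~80 GiB of visible HBM (illustrative arithmetic, not a measured number):

  awk 'BEGIN {
    hbm_gib   = 80     # assumed visible H100 HBM, rough
    util      = 0.95   # gpu-memory-utilization above
    shard_gib = 5      # approximate per-rank weight shard at TP=16
    printf "vLLM budget: %.1f GiB; left for KV cache + activations: ~%.0f GiB\n", hbm_gib * util, hbm_gib * util - shard_gib
  }'
  # -> vLLM budget: 76.0 GiB; left for KV cache + activations: ~71 GiB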
+ use_chat_template: false diff --git a/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml new file mode 100644 index 000000000..e31e8d531 --- /dev/null +++ b/benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/8k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml @@ -0,0 +1,120 @@ +# DeepSeek-V4-Pro Disaggregated with vLLM (H100 1P+1D / 32-GPU, cross-node TP=16) — 8k/1k +# +# Same engine flags as the 1k1k variant. Only the benchmark block differs +# (ISL=8192, tighter concurrency sweep due to larger prefill work). +# +# See recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml +# for the full rationale (cross-node TP=16 layout, MultiprocExecutor + +# torch.distributed PG handshake, --headless secondary node, IB-backed +# NCCL TP all-reduce, deepseek_v4 parsers, NixlConnector). + +name: "deepseek-v4-pro-vllm-disagg-h100-1p1d-tep16-tep16-8k1k" + +model: + path: "dsv4-fp8" + container: "vllm/vllm-openai:deepseekv4-cu129" + precision: "fp8" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "h100" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 16 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: nixl + + prefill_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SKIP_P2P_CHECK: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_SERVER_DEV_MODE: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" + NVIDIA_GDRCOPY: "enabled" + GLOO_SOCKET_IFNAME: "eth0" + PYTHONUNBUFFERED: "1" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + kv-cache-dtype: "fp8" + block-size: 256 + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + tensor-parallel-size: 16 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 512 + max-num-batched-tokens: 512 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 
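Why 8k prompts make each request heavier than in the 1k1k sweep, hence the shorter concurrency list that follows; this assumes vLLM chunks prefill at the max-num-batched-tokens budget of 512 configured above (illustrative arithmetic only):

  awk 'BEGIN { isl = 8192; budget = 512; printf "prefill scheduler steps per request: >= %d\n", int((isl + budget - 1) / budget) }'
  # -> prefill scheduler steps per request: >= 16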
+ concurrencies: "4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 397da6591..e26c5b4f7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,3 +1819,13 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + +- config-keys: + - dsv4-fp8-h100-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP8 H100 multinode disagg benchmark via dynamo-vllm" + - "2 prefill nodes + 2 decode nodes (32 H100s total, TP16/EP16 per side)" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Engine flags match H200 single-node recipe (deepseek_v4 tokenizer/parsers, FP8 KV cache, block size 256, prefix caching disabled)" + - "max-model-len 16384 (H100 80GB KV headroom; H200's 800k does not fit across 2 decode nodes)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1142 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..26eabad89 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -29,8 +29,29 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Unsupported model prefix/precision for dynamo-trt: $MODEL_PREFIX/$PRECISION" exit 1 fi + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/nfs/lustre/models/dsv4-fp8" + export SERVED_MODEL_NAME="deepseek-ai/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="dsv4-fp8" + # NVIDIA/srt-slurm@sa-submission-q2-2026 installs ai-dynamo 1.0.1, + # which imports vllm.inputs.data.TokensPrompt — a path the DSV4 + # vLLM wheel has removed. Switch to alec-flowers' fork (head of + # https://github.com/NVIDIA/srt-slurm/pull/71) which supports + # dynamo.hash pinning so the recipe can pick a dynamo commit + # compatible with the DSV4 vllm.inputs layout. Matches PR #1129 + # on GB200. + export SRT_SLURM_REPO_URL="https://github.com/alec-flowers/srt-slurm.git" + export SRT_SLURM_REF="d60e3f1c7921721e52af01afaab59a70a1631106" + else + echo "Unsupported model prefix/precision for dynamo-vllm: $MODEL_PREFIX/$PRECISION" + exit 1 + fi + # Verify the weights are staged and log their size (catches partial + # downloads / wrong revisions before we burn 8 min on weight load). + du -sh "$MODEL_PATH" 2>/dev/null || echo "WARNING: could not stat $MODEL_PATH" else - echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -41,9 +62,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone "${SRT_SLURM_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout "${SRT_SLURM_REF:-sa-submission-q2-2026}" + + # Overlay any in-repo srt-slurm recipes onto the clone. Kept here until + # the upstream PR lands; cp -r merges directories on GNU cp.
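What the recipe overlay in the next block is expected to produce, tying the bundled recipe back to the CONFIG_FILE path that nvidia-master.yaml passes in; the check below is a standalone illustration run from inside the srt-slurm clone, not part of the launcher:

  # After `cp -r "$LOCAL_RECIPES_DIR"/* recipes/`, the in-repo file
  #   benchmarks/multi_node/srt_slurm_recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml
  # should appear at the path the config references:
  test -f recipes/vllm/deepseek-v4-pro/1k1k/disagg-h100-fp8-1p1d-tep16-tep16.yaml \
    && echo "overlay OK" \
    || echo "overlay missing the 1k1k dsv4 recipe"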
+ LOCAL_RECIPES_DIR="$GITHUB_WORKSPACE/benchmarks/multi_node/srt_slurm_recipes" + if [ -d "$LOCAL_RECIPES_DIR" ]; then + echo "Overlaying local srt-slurm recipes from $LOCAL_RECIPES_DIR" + cp -r "$LOCAL_RECIPES_DIR"/* recipes/ + fi echo "Installing srtctl..." export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" @@ -78,6 +107,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + # vLLM container mapping - IMAGE is a Docker Hub reference (no registry prefix swap) + CONTAINER_KEY="$IMAGE" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" fi export ISL="$ISL" @@ -105,6 +138,7 @@ model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}" @@ -156,11 +190,21 @@ EOF LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + # Defensive: pre-create the logs subdir so Slurm's #SBATCH --output=... + # /%j/logs/sweep_%j.log can open the target file even on NFS mounts + # where the compute-node Slurm stepd lacks permission to mkdir -p. + mkdir -p "$LOGS_DIR" 2>/dev/null || true + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + scontrol show job "$JOB_ID" | tee "outputs/$JOB_ID/scontrol_show_job.txt" 2>/dev/null + # Preserve sbatch_script.sh, config.yaml, metadata, and any partial + # log so the failure can be diagnosed from the CI artifact. + if [ -d "outputs/$JOB_ID" ]; then + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "outputs/$JOB_ID" . + fi exit 1 fi echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
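How the dynamo-vllm container mapping above resolves the recipe's image to a concrete squash file, using the same sed expression as the launcher; the result is the path that the generated srt-slurm config points the dynamo-vllm container key at:

  IMAGE="vllm/vllm-openai:deepseekv4-cu129"
  SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh"
  echo "$SQUASH_FILE"
  # -> /mnt/nfs/sa-shared/containers/vllm+vllm-openai+deepseekv4-cu129.sqsh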