diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 5d58f1e6..fdb949e4 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -98,6 +98,8 @@ madengine build [OPTIONS] | `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | | `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | +| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image. Omit value to auto-detect from model's `DOCKER_IMAGE_NAME` | +| `--build-on-compute` | | FLAG | `False` | Build Docker images on SLURM compute node instead of login node | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache | @@ -142,6 +144,16 @@ madengine build --tags model \ # Real-time output with verbose logging madengine build --tags model --live-output --verbose + +# Use pre-built image (skip Docker build) +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Build on SLURM compute node instead of login node +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json ``` **Required Context for Build:** @@ -170,6 +182,159 @@ When using `--batch-manifest`, provide a JSON file with selective build configur See [Batch Build Guide](batch-build.md) for details. 
+**Pre-built Image Mode (`--use-image`):** + +Skip Docker build and use an existing image from a registry or local Docker cache: + +```bash +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file config.json + +# Explicitly specify image from Docker Hub +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file config.json + +# Use image from NGC +madengine build --tags model \ + --use-image nvcr.io/nvidia/pytorch:24.01-py3 + +# Use locally cached image +madengine build --tags model \ + --use-image my-local-image:latest +``` + +**Image Resolution Priority:** +1. If `--use-image IMAGE` is specified, use that image +2. If `--use-image` (no value), auto-detect from model card's `DOCKER_IMAGE_NAME` env var +3. If no image found in model card, error with helpful suggestions + +**Multiple Models Warning:** +When using auto-detection with multiple models that have different `DOCKER_IMAGE_NAME` values, the first model's image is used and a warning is printed. + +**Mutual Exclusivity:** +- `--use-image` cannot be used with `--registry` (push requires local build) +- `--use-image` cannot be used with `--build-on-compute` (skip build vs. build on compute) + +**When to use `--use-image`:** +- Using official framework images (SGLang, vLLM, etc.) +- Image is pre-cached on compute nodes +- Testing without rebuilding +- CI/CD pipelines with external images + +The generated manifest marks the image as `"prebuilt": true` with `build_time: 0`. 
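The resolution priority above can be sketched as a small helper. This is an illustrative sketch, not madengine's actual API: `resolve_image` and its arguments are invented names, though the `"auto"` sentinel mirrors the `flag_value="auto"` used by the CLI option.

```python
def resolve_image(cli_value, model_env):
    """Resolve which pre-built image to use, per the documented priority.

    cli_value: value passed to --use-image ("auto" when the flag is given
    with no value), or None when the flag is absent.
    model_env: the model card's env_vars dict.
    """
    if cli_value and cli_value != "auto":
        return cli_value  # 1. explicitly specified image wins
    if cli_value == "auto":
        image = model_env.get("DOCKER_IMAGE_NAME")
        if image:
            return image  # 2. auto-detected from the model card
        # 3. no image found: fail with a suggestion
        raise ValueError(
            "No DOCKER_IMAGE_NAME in model card; "
            "pass an image explicitly with --use-image"
        )
    return None  # flag absent: proceed with a normal Docker build
```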
+ +**Build on Compute Node (`--build-on-compute`):** + +Build Docker images on a SLURM compute node, push to registry, and pull in parallel during run phase: + +```bash +# Build on compute node and push to registry (--registry REQUIRED) +madengine build --tags model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context-file slurm-config.json +``` + +**Required:** `--registry` must be specified with `--build-on-compute`. + +**SLURM Config Priority:** +1. Model card's `slurm` section (base configuration) +2. `--additional-context` overrides (command line takes precedence) + +If the model card already has `slurm` config, you only need to provide missing or override values: + +```bash +# Model card has partition/time, just override reservation +madengine build --tags model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' +``` + +**When to use `--build-on-compute`:** +- Login node has limited disk space or resources +- Build requires GPU access (e.g., AOT compilation) +- Login node policies prohibit heavy workloads +- Distributing images to many compute nodes (build once, pull everywhere) + +**How it works:** + +*Build Phase:* +1. Discovers model and merges SLURM config (model card + additional-context) +2. Submits build job to **1 compute node** via `sbatch --wait` +3. Builds Docker image on that node +4. Pushes image to registry +5. Generates manifest with registry image name + +*Run Phase:* +1. Detects `built_on_compute: true` in manifest +2. Pulls image **in parallel on ALL nodes** via `srun docker pull` +3. Executes model script + +**Inside existing SLURM allocation:** + +If you're already inside an `salloc` allocation, `--build-on-compute` uses `srun` directly instead of submitting a new job. 
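The two-level config priority described above (model card as base, `--additional-context` taking precedence) amounts to a shallow dictionary merge. A minimal sketch, with the function name and value shapes chosen for illustration rather than taken from madengine internals:

```python
def merge_slurm_config(model_card_slurm, context_slurm):
    """Merge SLURM settings: the model card supplies the base
    configuration; --additional-context values override it."""
    merged = dict(model_card_slurm)  # 1. base configuration from the model card
    merged.update(context_slurm)     # 2. command-line overrides take precedence
    return merged

# Model card already has partition/time; only reservation is added,
# and time is overridden from the command line.
base = {"partition": "gpu", "time": "04:00:00"}
override = {"reservation": "my-res", "time": "08:00:00"}
merged = merge_slurm_config(base, override)
```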
+ +**Error Messages:** + +If required SLURM fields are missing, specific errors are shown: +- Missing `partition`: "Add partition to model card's slurm section or via --additional-context" + +--- + +**Multi-Node SLURM Launcher (`slurm_multi`):** + +Models using the `slurm_multi` launcher (for multi-node distributed inference) **require** either `--registry` or `--use-image`: + +```bash +# Option 1: Build and push to registry +madengine build --tags sglang_model \ + --registry docker.io/myorg \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Option 2: Use pre-built image from registry +madengine build --tags sglang_model \ + --use-image docker.io/myorg/sglang:latest + +# Option 3: Build on compute and push +madengine build --tags sglang_model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context-file config.json +``` + +**Why this requirement?** + +Multi-node SLURM jobs run on multiple compute nodes. Each node needs access to the Docker image: +- Local builds only exist on the login/build node +- Compute nodes cannot access locally built images +- Registry images enable parallel `docker pull` on all nodes + +**Parallel Image Pull:** + +During `madengine run`, images from a registry are automatically pulled in parallel on all allocated nodes: + +```bash +srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES docker pull <image> +``` + +This ensures fast, consistent image availability across the cluster. 
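The registry requirement above can be summarized as a single validation rule. This sketch is illustrative (madengine's real checks live in its CLI layer); the underscore/hyphen normalization mirrors how the `slurm_multi`/`slurm-multi` aliases are matched in the source:

```python
def check_slurm_multi_build_args(launcher, registry=None, use_image=None):
    """Documented rule: slurm_multi builds need an image that every
    compute node can pull, so --registry or --use-image is mandatory."""
    # Accept both spellings, matching the slurm_multi / slurm-multi aliases
    if launcher.lower().replace("-", "_") != "slurm_multi":
        return  # other launchers may use a purely local build
    if not (registry or use_image):
        raise ValueError(
            "slurm_multi requires --registry or --use-image: "
            "images built locally are not visible to the other compute nodes"
        )
```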
+ +**Re-using Images:** + +For subsequent runs with the same image, use `--use-image` to skip building: + +```bash +# First run: build and push +madengine build --tags model --registry docker.io/myorg + +# Subsequent runs: use pre-built image +madengine build --tags model --use-image docker.io/myorg/model:latest +``` + --- ### `run` - Execute Models diff --git a/docs/deployment.md b/docs/deployment.md index 794fe3aa..d5545f5f 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -258,6 +258,39 @@ SLURM automatically provides: - Network interface configuration - Rank assignment via `$SLURM_PROCID` +### SLURM Allocation Detection + +madengine automatically detects if you're running inside an existing SLURM allocation (via `salloc`): + +```bash +# Allocate nodes interactively +salloc -N 3 -p gpu --gpus-per-node=8 -t 04:00:00 + +# madengine detects the allocation automatically +madengine run --manifest-file build_manifest.json +# Output: ✓ Detected existing SLURM allocation: Job 12345 +# Allocation has 3 nodes available +``` + +**Behavior inside allocation:** +- Uses `srun` directly instead of `sbatch` +- Validates requested nodes ≤ available nodes +- Warns if using fewer nodes than allocated +- Skips job submission (already allocated) + +**Build inside allocation:** + +```bash +# Inside salloc session +madengine build --tags model --build-on-compute +# Uses srun instead of sbatch --wait +``` + +**Environment variables detected:** +- `SLURM_JOB_ID` - Indicates inside allocation +- `SLURM_NNODES` - Number of nodes available +- `SLURM_NODELIST` - List of allocated nodes + ### Monitoring ```bash @@ -372,6 +405,60 @@ scancel -u $USER } ``` +### Baremetal Execution (slurm_multi) + +For disaggregated inference workloads like SGLang Disaggregated, madengine supports baremetal execution where the model's `.slurm` script manages Docker containers directly: + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "gpus_per_node": 8, + "time": "04:00:00" + }, + 
"distributed": { + "launcher": "slurm_multi", + "nnodes": 3, + "nproc_per_node": 8, + "sglang_disagg": { + "prefill_nodes": 1, + "decode_nodes": 1 + } + } +} +``` + +**How baremetal execution works:** +1. madengine generates a wrapper script (not a Docker container) +2. The wrapper runs the model's `.slurm` script directly on baremetal +3. The `.slurm` script manages Docker containers via `srun` +4. Environment variables from `models.json` and `additional-context` are passed through + +**When to use `slurm_multi`:** +- SGLang Disaggregated inference (proxy + prefill + decode nodes) +- Workloads requiring direct SLURM node control +- Custom Docker orchestration via `.slurm` scripts + +**Registry Requirement:** + +Models using `slurm_multi` **require** either `--registry` or `--use-image` during build: + +```bash +# Option 1: Build and push to registry +madengine build --tags model --registry docker.io/myorg + +# Option 2: Use pre-built image +madengine build --tags model --use-image + +# Option 3: Build on compute and push +madengine build --tags model --build-on-compute --registry docker.io/myorg +``` + +This ensures all compute nodes can pull the image in parallel during `madengine run`. + +See [Launchers Guide](launchers.md#7-sglang-disaggregated-new) for detailed configuration. 
+ ## Troubleshooting ### Kubernetes Issues diff --git a/docs/launchers.md b/docs/launchers.md index b4ae7d34..99b130c7 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -364,7 +364,7 @@ SGLang Disaggregated separates inference into specialized node pools: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "sglang_disagg": { @@ -403,7 +403,7 @@ Override automatic split based on workload characteristics: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "sglang_disagg": { @@ -436,6 +436,18 @@ Override automatic split based on workload characteristics: - Ray cluster coordination - No torchrun needed (manages own processes) +**Registry Requirement (SLURM)**: + +Models using `slurm_multi` launcher **require** `--registry` or `--use-image` during build: + +```bash +madengine build --tags model --registry docker.io/myorg +# OR +madengine build --tags model --use-image +``` + +This ensures all compute nodes can pull the image in parallel during `madengine run`. + **Environment Variables (K8s)**: ```bash POD_INDEX=${JOB_COMPLETION_INDEX} # Pod index for role assignment @@ -457,12 +469,12 @@ SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." 
``` **Examples**: -- K8s Minimal: `examples/k8s-configs/minimal/sglang-disagg-minimal.json` -- K8s Basic: `examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json` -- K8s Custom: `examples/k8s-configs/basic/sglang-disagg-custom-split.json` -- SLURM Minimal: `examples/slurm-configs/minimal/sglang-disagg-minimal.json` -- SLURM Basic: `examples/slurm-configs/basic/sglang-disagg-multi-node.json` -- SLURM Custom: `examples/slurm-configs/basic/sglang-disagg-custom-split.json` +- K8s Minimal: `examples/k8s-configs/minimal/slurm-multi-minimal.json` +- K8s Basic: `examples/k8s-configs/basic/slurm-multi-multi-node-basic.json` +- K8s Custom: `examples/k8s-configs/basic/slurm-multi-custom-split.json` +- SLURM Minimal: `examples/slurm-configs/minimal/slurm-multi-minimal.json` +- SLURM Basic: `examples/slurm-configs/basic/slurm-multi-multi-node.json` +- SLURM Custom: `examples/slurm-configs/basic/slurm-multi-custom-split.json` **Comparison: SGLang vs SGLang Disaggregated**: @@ -681,7 +693,7 @@ SGLANG_NODE_RANK=${SLURM_PROCID} ```bash Error: Unknown launcher type 'xyz' ``` -Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `sglang-disagg` +Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `slurm_multi` **2. 
Multi-Node Communication Fails** ```bash diff --git a/docs/usage.md b/docs/usage.md index 89ebd415..bdee5cb3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -273,6 +273,99 @@ madengine build --batch-manifest batch.json \ ] ``` + +### Pre-built Image Mode + +Skip Docker build entirely and use an existing image: + +```bash +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file slurm-config.json + +# Or explicitly specify the image +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Then run normally +madengine run --manifest-file build_manifest.json +``` + +**Image Resolution:** +1. If `--use-image IMAGE` provided → use that image +2. If `--use-image` (no value) → auto-detect from model card's `DOCKER_IMAGE_NAME` +3. If no image found → error with helpful message + +**Mutual Exclusivity:** +- Cannot use with `--registry` (push requires local build) +- Cannot use with `--build-on-compute` (skip vs. build) + +**Use cases:** +- Official framework images (SGLang, vLLM, PyTorch NGC) +- Pre-cached images on compute nodes +- Quick testing without rebuild time +- CI/CD with external registries + +The manifest marks the image as `"prebuilt": true` with zero build time. + +### Build on Compute Node + +For SLURM environments where login nodes have limited resources: + +```bash +# Build on 1 compute node, push to registry, pull in parallel at runtime +madengine build --tags model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' +``` + +**Required:** `--registry` must be specified. + +**SLURM Config Merging:** +- Model card's `slurm` section provides base configuration +- `--additional-context` overrides specific fields +- Only specify what's missing or needs override + +**How it works:** + +*Build Phase:* +1. 
Builds Docker image on **1 compute node** +2. Pushes image to registry +3. Stores registry image name in manifest + +*Run Phase:* +1. Pulls image **in parallel on ALL nodes** via `srun docker pull` +2. Executes model script + +**Benefits:** +- Offloads heavy build to compute resources +- Build once, distribute via registry pull +- Respects login node resource policies +- Parallel pull scales to many nodes + +### Multi-Node SLURM (slurm_multi) + +Models using the `slurm_multi` launcher **require** either `--registry` or `--use-image`: + +```bash +# Option 1: Build and push +madengine build --tags sglang_model --registry docker.io/myorg + +# Option 2: Use pre-built image +madengine build --tags sglang_model --use-image + +# Option 3: Build on compute +madengine build --tags sglang_model --build-on-compute --registry docker.io/myorg +``` + +**Why?** Multi-node jobs run on multiple compute nodes. Each node needs the Docker image, and local builds only exist on the login node. + +**Parallel Pull:** During `madengine run`, registry images are automatically pulled in parallel on all nodes before execution. + +**Re-using images:** For subsequent runs with the same image, use `--use-image` to skip building. 
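The run-phase detection described above keys off the build manifest. A hypothetical manifest fragment might look like the following sketch; the `built_models` key and `built_on_compute` flag appear in madengine's manifest handling, but the exact layout here is illustrative:

```json
{
  "built_models": {
    "docker.io/myorg/model:latest": {
      "built_on_compute": true
    }
  }
}
```

At run time, seeing `built_on_compute: true` triggers the parallel `srun docker pull` on all allocated nodes before the model script executes.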
+ ## Run Workflow ### Local Execution diff --git a/examples/k8s-configs/basic/sglang-disagg-custom-split.json b/examples/k8s-configs/basic/slurm-multi-custom-split.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-custom-split.json rename to examples/k8s-configs/basic/slurm-multi-custom-split.json index 49aeecb1..12fd2a4a 100644 --- a/examples/k8s-configs/basic/sglang-disagg-custom-split.json +++ b/examples/k8s-configs/basic/slurm-multi-custom-split.json @@ -26,7 +26,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "master_port": 29500, diff --git a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json rename to examples/k8s-configs/basic/slurm-multi-multi-node-basic.json index c16fd342..94f51fc5 100644 --- a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json +++ b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "master_port": 29500 diff --git a/examples/k8s-configs/minimal/sglang-disagg-minimal.json b/examples/k8s-configs/minimal/slurm-multi-minimal.json similarity index 92% rename from examples/k8s-configs/minimal/sglang-disagg-minimal.json rename to examples/k8s-configs/minimal/slurm-multi-minimal.json index f0f6ad05..1aab4386 100644 --- a/examples/k8s-configs/minimal/sglang-disagg-minimal.json +++ b/examples/k8s-configs/minimal/slurm-multi-minimal.json @@ -11,7 +11,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/slurm-multi-custom-split.json similarity index 98% rename 
from examples/slurm-configs/basic/sglang-disagg-custom-split.json rename to examples/slurm-configs/basic/slurm-multi-custom-split.json index 291a5938..0c7962aa 100644 --- a/examples/slurm-configs/basic/sglang-disagg-custom-split.json +++ b/examples/slurm-configs/basic/slurm-multi-custom-split.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/slurm-multi-multi-node.json similarity index 98% rename from examples/slurm-configs/basic/sglang-disagg-multi-node.json rename to examples/slurm-configs/basic/slurm-multi-multi-node.json index 0c5ec00d..f6ea63e8 100644 --- a/examples/slurm-configs/basic/sglang-disagg-multi-node.json +++ b/examples/slurm-configs/basic/slurm-multi-multi-node.json @@ -24,7 +24,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/minimal/sglang-disagg-minimal.json b/examples/slurm-configs/minimal/slurm-multi-minimal.json similarity index 93% rename from examples/slurm-configs/minimal/sglang-disagg-minimal.json rename to examples/slurm-configs/minimal/slurm-multi-minimal.json index ee4ad9f2..95fb9761 100644 --- a/examples/slurm-configs/minimal/sglang-disagg-minimal.json +++ b/examples/slurm-configs/minimal/slurm-multi-minimal.json @@ -14,7 +14,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index 99166a47..594d0661 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -55,6 +55,22 @@ def build( "--batch-manifest", help="Input batch.json file for batch build mode" ), ] = None, + use_image: Annotated[ + Optional[str], + 
typer.Option( + "--use-image", + is_flag=False, + flag_value="auto", + help="Skip Docker build and use pre-built image. Optionally specify image name, or omit to auto-detect from model card's DOCKER_IMAGE_NAME" + ), + ] = None, + build_on_compute: Annotated[ + bool, + typer.Option( + "--build-on-compute", + help="Build Docker images on SLURM compute node instead of login node" + ), + ] = False, additional_context: Annotated[ str, typer.Option( @@ -110,6 +126,31 @@ def build( ) raise typer.Exit(ExitCode.INVALID_ARGS) + if use_image and registry: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --registry options[/bold red]\n" + "[yellow]Use --use-image for pre-built external images.[/yellow]\n" + "[yellow]Use --registry to push locally built images.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if use_image and build_on_compute: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --build-on-compute options[/bold red]\n" + "[yellow]--use-image skips Docker build entirely.[/yellow]\n" + "[yellow]--build-on-compute builds on SLURM compute nodes.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if build_on_compute and not registry: + console.print( + "❌ [bold red]Error: --build-on-compute requires --registry option[/bold red]\n" + "[yellow]Build on compute node pushes image to registry.[/yellow]\n" + "[yellow]Run phase will pull image in parallel on all nodes.[/yellow]\n" + "[dim]Example: --build-on-compute --registry docker.io/myorg[/dim]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + # Process batch manifest if provided batch_data = None effective_tags = processed_tags @@ -167,8 +208,8 @@ def build( ) try: - # Validate additional context - validate_additional_context(additional_context, additional_context_file) + # Validate additional context (gpu_vendor/guest_os optional when using pre-built image) + validate_additional_context(additional_context, additional_context_file, use_image) # Create 
arguments object args = create_args_namespace( @@ -183,6 +224,8 @@ def build( verbose=verbose, _separate_phases=True, batch_build_metadata=batch_build_metadata if batch_build_metadata else None, + use_image=use_image, + build_on_compute=build_on_compute, ) # Initialize orchestrator in build-only mode @@ -203,6 +246,8 @@ def build( clean_cache=clean_docker_cache, manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, + use_image=use_image, + build_on_compute=build_on_compute, ) # Load build summary for display diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index 500232d7..d7af9c7e 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -165,7 +165,7 @@ def extract_model_name(item): else: model_name = docker_image return model_name - return str(item)[:20] + return str(item) # Helper function to format numbers def format_number(value): @@ -247,7 +247,7 @@ def format_number(value): row_index += 1 else: # Fallback for non-dict items - model_name = str(item)[:20] + model_name = str(item) if has_node_data: row = [str(row_index), "✅ Success", model_name, "node-0", "-", "-"] else: @@ -378,7 +378,7 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Index", justify="right", style="dim") perf_table.add_column("Model", style="cyan") perf_table.add_column("Topology", justify="center", style="blue") - perf_table.add_column("Launcher", justify="center", style="magenta") # Distributed launcher + perf_table.add_column("Workload", justify="center", style="magenta") perf_table.add_column("Deployment", justify="center", style="cyan") perf_table.add_column("GPU Arch", style="yellow") perf_table.add_column("Performance", justify="right", style="green") @@ -475,7 +475,7 @@ def format_performance(perf): str(idx), model, topology, - launcher, # Distributed launcher (docker, torchrun, vllm, etc.) + launcher, # Workload type (sglang-disagg, vllm, torchrun, etc.) 
deployment_type, gpu_arch, performance, diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index 6bfc7bdb..6a5d5013 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -26,6 +26,7 @@ def validate_additional_context( additional_context: str, additional_context_file: Optional[str] = None, + use_image: Optional[str] = None, ) -> Dict[str, str]: """ Validate and parse additional context. @@ -33,6 +34,7 @@ def validate_additional_context( Args: additional_context: JSON string containing additional context additional_context_file: Optional file containing additional context + use_image: Optional pre-built image to use (skips required field validation) Returns: Dict containing parsed additional context @@ -65,6 +67,36 @@ def validate_additional_context( console.print("💡 Please provide valid JSON format") raise typer.Exit(ExitCode.INVALID_ARGS) + # When using pre-built image, gpu_vendor and guest_os are optional + # They will be extracted from model card env_vars if needed + using_prebuilt_image = use_image and use_image.lower() not in ["none", ""] + + if using_prebuilt_image: + # For pre-built images, context is optional - will use model card env_vars + if not context: + console.print("ℹ️ No additional context provided (using pre-built image)") + console.print("💡 gpu_vendor and guest_os will be read from model card env_vars if needed") + return {} + + # If context provided, validate any gpu_vendor/guest_os values present + if "gpu_vendor" in context: + gpu_vendor = context["gpu_vendor"].upper() + if gpu_vendor not in VALID_GPU_VENDORS: + console.print(f"❌ Invalid gpu_vendor: [red]{context['gpu_vendor']}[/red]") + console.print(f"💡 Supported values: [green]{', '.join(VALID_GPU_VENDORS)}[/green]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + if "guest_os" in context: + guest_os = context["guest_os"].upper() + if guest_os not in VALID_GUEST_OS: + console.print(f"❌ Invalid guest_os: 
[red]{context['guest_os']}[/red]") + console.print(f"💡 Supported values: [green]{', '.join(VALID_GUEST_OS)}[/green]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + console.print("✅ Context validated (pre-built image mode)") + return context + + # For Dockerfile builds, require gpu_vendor and guest_os if not context: console.print("❌ [red]No additional context provided[/red]") console.print( @@ -81,14 +113,17 @@ def validate_additional_context( [bold cyan]Required fields:[/bold cyan] • gpu_vendor: [green]AMD[/green], [green]NVIDIA[/green] -• guest_os: [green]UBUNTU[/green], [green]CENTOS[/green]""", +• guest_os: [green]UBUNTU[/green], [green]CENTOS[/green] + +[bold cyan]Or use a pre-built image:[/bold cyan] +madengine build --tags dummy --use-image auto""", title="Additional Context Help", border_style="blue", ) console.print(example_panel) raise typer.Exit(ExitCode.INVALID_ARGS) - # Validate required fields + # Validate required fields for Dockerfile builds required_fields = ["gpu_vendor", "guest_os"] missing_fields = [field for field in required_fields if field not in context] @@ -97,7 +132,10 @@ def validate_additional_context( f"❌ Missing required fields: [red]{', '.join(missing_fields)}[/red]" ) console.print( - "💡 Both gpu_vendor and guest_os are required for build operations" + "💡 Both gpu_vendor and guest_os are required for Dockerfile builds" + ) + console.print( + "💡 Or use --use-image to skip Dockerfile build" ) raise typer.Exit(ExitCode.INVALID_ARGS) diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py index 33a338a9..090b2f5a 100644 --- a/src/madengine/deployment/base.py +++ b/src/madengine/deployment/base.py @@ -51,6 +51,7 @@ class DeploymentResult: metrics: Optional[Dict[str, Any]] = None logs_path: Optional[str] = None artifacts: Optional[List[str]] = None + skip_monitoring: bool = False # Set True for synchronous runs (e.g., inside salloc) @property def is_success(self) -> bool: @@ -164,7 +165,8 @@ def execute(self) -> 
DeploymentResult: return result # Step 4: Monitor (optional) - if self.config.monitor: + # Skip monitoring if deploy() already ran synchronously (e.g., inside salloc) + if self.config.monitor and not result.skip_monitoring: result = self._monitor_until_complete(result.deployment_id) # Step 5: Collect Results (always collect, even on failure to record failed runs) diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 9d5b6e9f..6cd15809 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -51,7 +51,13 @@ "megatron-lm", "vllm", "sglang", - "sglang-disagg" + "slurm_multi", +] + +# slurm_multi launcher name variants (underscore and hyphen) +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] @@ -959,20 +965,20 @@ def _prepare_template_context( model_script=model_info.get("scripts", "run.sh") ) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]: if nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " + f"slurm_multi launcher requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) # Always create headless service for disaggregated architecture create_headless_service = True - self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") + self.console.print(f"[dim]slurm_multi: Creating headless service for {nnodes} pods[/dim]") self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") - # Generate SGLang Disaggregated launcher command - launcher_command = self._generate_sglang_disagg_command( + # Generate slurm_multi launcher command + launcher_command = self._generate_slurm_multi_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, @@ -1631,13 
+1637,13 @@ def _generate_torchtitan_command( --tee=3 \\ {model_script}""" - def _generate_sglang_disagg_command( + def _generate_slurm_multi_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ - Generate SGLang Disaggregated launcher command for K8s Indexed Jobs. + Generate slurm_multi launcher command for K8s Indexed Jobs. - SGLang Disaggregated uses separate node pools for: + slurm_multi uses separate node pools for: - Proxy (index 0): Load balancer and request router - Prefill (indices 1 to xP): Prompt processing - Decode (indices xP+1 to end): Token generation @@ -1656,7 +1662,7 @@ def _generate_sglang_disagg_command( model_script: Path to model launch script Returns: - Complete disaggregated launch setup + Complete multi-node launch setup Raises: ValueError: If nnodes < 3 or invalid parameters @@ -1664,7 +1670,7 @@ def _generate_sglang_disagg_command( # Validate if not isinstance(nnodes, int) or nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes, got {nnodes}" + f"slurm_multi requires minimum 3 nodes, got {nnodes}" ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: raise ValueError(f"nproc_per_node must be >= 1, got {nproc_per_node}") diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 8a4a1f7b..fe588954 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -33,7 +33,13 @@ "megatron-lm", "vllm", "sglang", - "sglang-disagg" + "slurm_multi", +] + +# slurm_multi launcher name variants (underscore and hyphen) +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] @@ -249,6 +255,7 @@ def __init__(self, config: DeploymentConfig): self.gpus_per_node = self.slurm_config.get("gpus_per_node", 8) self.time_limit = self.slurm_config.get("time", "24:00:00") self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_output")) + self.reservation = self.slurm_config.get("reservation", None) # Setup 
Jinja2 template engine template_dir = Path(__file__).parent / "templates" / "slurm" @@ -261,6 +268,115 @@ def __init__(self, config: DeploymentConfig): # Generated script path self.script_path = None + # ========== OPTION 2: Detect existing SLURM allocation ========== + # If SLURM_JOB_ID exists, we're inside an salloc allocation + self.inside_allocation = os.environ.get("SLURM_JOB_ID") is not None + self.existing_job_id = os.environ.get("SLURM_JOB_ID", "") + self.allocation_nodes = self._get_allocation_node_count() + + if self.inside_allocation: + self.console.print( + f"[cyan]✓ Detected existing SLURM allocation: Job {self.existing_job_id}[/cyan]" + ) + self.console.print( + f" Allocation has {self.allocation_nodes} nodes available" + ) + + def _get_allocation_node_count(self) -> int: + """ + Get number of nodes in current SLURM allocation. + + Note: SLURM_NNODES reflects the current job step, not the full allocation. + We query the job directly using scontrol to get the actual node count. + """ + if not self.inside_allocation: + return 0 + + job_id = self.existing_job_id + + # Query the actual job's node count using scontrol (most accurate) + try: + result = subprocess.run( + ["scontrol", "show", "job", job_id], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + # Parse NumNodes=X from output + for line in result.stdout.split("\n"): + if "NumNodes=" in line: + # Format: "NumNodes=3 NumCPUs=..." 
+                        for part in line.split():
+                            if part.startswith("NumNodes="):
+                                try:
+                                    return int(part.split("=")[1])
+                                except (ValueError, IndexError):
+                                    pass
+        except Exception:
+            pass
+
+        # Fallback: Try SLURM_JOB_NUM_NODES (full job node count, if set)
+        job_num_nodes = os.environ.get("SLURM_JOB_NUM_NODES")
+        if job_num_nodes:
+            try:
+                return int(job_num_nodes)
+            except ValueError:
+                pass
+
+        # Fallback: SLURM_NNODES (may be step-specific, not full allocation)
+        nnodes = os.environ.get("SLURM_NNODES")
+        if nnodes:
+            try:
+                return int(nnodes)
+            except ValueError:
+                pass
+
+        # Last resort: count nodes in SLURM_NODELIST
+        nodelist = os.environ.get("SLURM_NODELIST")
+        if nodelist:
+            try:
+                result = subprocess.run(
+                    ["scontrol", "show", "hostname", nodelist],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    return len(result.stdout.strip().split("\n"))
+            except Exception:
+                pass
+
+        return 0
+
+    def _validate_allocation_nodes(self) -> tuple[bool, str]:
+        """
+        Validate that existing allocation has enough nodes for the job.
+
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        if not self.inside_allocation:
+            return True, ""
+
+        requested_nodes = self.nodes
+        available_nodes = self.allocation_nodes
+
+        if available_nodes < requested_nodes:
+            return False, (
+                f"Insufficient nodes in current allocation. "
+                f"Requested: {requested_nodes}, Available: {available_nodes}. "
+                f"Either reduce nodes in config or use a larger allocation."
+            )
+
+        if available_nodes > requested_nodes:
+            self.console.print(
+                f"[yellow]⚠ Note: Using {requested_nodes} of {available_nodes} "
+                f"available nodes in allocation[/yellow]"
+            )
+
+        return True, ""
+
     def validate(self) -> bool:
         """Validate SLURM commands are available locally."""
         # Check required SLURM CLI tools
@@ -362,11 +478,6 @@ def prepare(self) -> bool:
         """Generate sbatch script from template."""
         # Validate environment BEFORE generating job scripts
         self.console.print("\n[bold]Validating submission environment...[/bold]")
-        if not self._validate_cli_availability():
-            self.console.print(
-                "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]"
-            )
-            return False
 
         try:
             self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -379,6 +490,28 @@ def prepare(self) -> bool:
             model_key = model_keys[0]
             model_info = self.manifest["built_models"][model_key]
 
+            # Check if this is a slurm_multi launcher (baremetal multi-node)
+            # Priority: model_info.distributed.launcher > additional_context.distributed.launcher
+            model_distributed = model_info.get("distributed", {})
+            launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun")
+            launcher_normalized = launcher_type.lower().replace("_", "-")
+
+            # Check against slurm_multi aliases (underscore and hyphen variants)
+            slurm_multi_aliases_normalized = [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]
+            if launcher_normalized in slurm_multi_aliases_normalized:
+                # For slurm_multi launchers, generate simple wrapper script
+                # that runs the model's .slurm script directly on baremetal
+                self.console.print(f"[cyan]Detected slurm_multi launcher: {launcher_type}[/cyan]")
+                # Pass model_key as docker_image_name (for manifests, the key IS the built image name)
+                return self._prepare_baremetal_script(model_info, docker_image_name=model_key)
+
+            # Standard flow: validate madengine availability for complex job template
+            if not self._validate_cli_availability():
+                self.console.print(
+                    "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]"
+                )
+                return False
+
             # Prepare template context
             context = self._prepare_template_context(model_info)
@@ -400,6 +533,201 @@ def prepare(self) -> bool:
             self.console.print(f"[red]✗ Failed to generate script: {e}[/red]")
             return False
 
+    def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool:
+        """
+        Generate a simple wrapper script for baremetal/slurm_multi launchers.
+
+        These launchers (slurm_multi, or its hyphenated alias slurm-multi) run the model's
+        .slurm script directly on baremetal, which then manages Docker containers
+        via srun. No madengine wrapper needed.
+
+        Args:
+            model_info: Model configuration from manifest
+            docker_image_name: The built Docker image name from manifest key
+        """
+        # Get the model's script path
+        model_script = model_info.get("scripts", "")
+        if not model_script:
+            self.console.print("[red]✗ No scripts defined in model_info[/red]")
+            return False
+
+        # Get manifest directory (where the model script is relative to)
+        manifest_dir = Path(self.config.manifest_file).parent.absolute()
+        model_script_path = manifest_dir / model_script
+
+        if not model_script_path.exists():
+            self.console.print(f"[red]✗ Model script not found: {model_script_path}[/red]")
+            return False
+
+        # Get environment variables
+        env_vars = {}
+
+        # From model_info.env_vars
+        if "env_vars" in model_info:
+            env_vars.update(model_info["env_vars"])
+
+        # From additional_context.env_vars
+        if "env_vars" in self.config.additional_context:
+            env_vars.update(self.config.additional_context["env_vars"])
+
+        # From distributed config (model's distributed section)
+        model_distributed = model_info.get("distributed", {})
+        sglang_disagg_config = model_distributed.get("sglang_disagg", {}) or self.distributed_config.get("sglang_disagg", {})
+        if sglang_disagg_config:
+            env_vars["xP"] = str(sglang_disagg_config.get("prefill_nodes", 1))
+            env_vars["yD"] = str(sglang_disagg_config.get("decode_nodes", 1))
+
+        # Override DOCKER_IMAGE_NAME with the built image from manifest
+        # This ensures the run uses the freshly built image, not the base image
+        # Priority: docker_image_name param > model_info.docker_image > env_vars.DOCKER_IMAGE_NAME
+        if docker_image_name and docker_image_name.startswith("ci-"):
+            # The manifest key IS the built image name for madengine-built images
+            self.console.print(f"[cyan]Using built Docker image: {docker_image_name}[/cyan]")
+            env_vars["DOCKER_IMAGE_NAME"] = docker_image_name
+        elif "docker_image" in model_info:
+            built_image = model_info["docker_image"]
+            self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]")
+            env_vars["DOCKER_IMAGE_NAME"] = built_image
+        elif "image" in model_info:
+            # Fallback to 'image' field
+            built_image = model_info["image"]
+            self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]")
+            env_vars["DOCKER_IMAGE_NAME"] = built_image
+
+        # Get model args
+        model_args = model_info.get("args", "")
+
+        # Generate simple wrapper script
+        # IMPORTANT: SBATCH directives MUST be at the top, right after #!/bin/bash
+        script_lines = [
+            "#!/bin/bash",
+            f"#SBATCH --job-name=madengine-{model_info['name']}",
+            f"#SBATCH --output={self.output_dir}/madengine-{model_info['name']}_%j.out",
+            f"#SBATCH --error={self.output_dir}/madengine-{model_info['name']}_%j.err",
+            f"#SBATCH --partition={self.partition}",
+            f"#SBATCH --nodes={self.nodes}",
+            f"#SBATCH --ntasks={self.nodes}",
+            f"#SBATCH --gpus-per-node={self.gpus_per_node}",
+            f"#SBATCH --time={self.time_limit}",
+            "#SBATCH --exclusive",
+        ]
+
+        # Add reservation if specified
+        if self.reservation:
+            script_lines.append(f"#SBATCH --reservation={self.reservation}")
+
+        # Add nodelist if specified (from model card or --additional-context)
+        nodelist = self._normalize_nodelist(self.slurm_config.get("nodelist"))
+        if nodelist:
+            script_lines.append(f"#SBATCH --nodelist={nodelist}")
+
+        script_lines.extend([
+            "",
+            f"# Baremetal launcher script for {model_info['name']}",
+            f"# Generated by madengine for slurm_multi",
+            "",
+            "# Note: no 'set -e' here; exit codes are captured explicitly below so the completion marker is always written",
+            "",
+            "# Environment variables",
+        ])
+
+        for key, value in env_vars.items():
+            script_lines.append(f"export {key}=\"{value}\"")
+
+        script_lines.append("")
+        script_lines.extend([
+            "echo '=========================================='",
+            "echo 'Baremetal Launcher - slurm_multi'",
+            "echo '=========================================='",
+            f"echo 'Model: {model_info['name']}'",
+            f"echo 'Script: {model_script_path}'",
+            "echo 'SLURM_JOB_ID:' $SLURM_JOB_ID",
+            "echo 'SLURM_NNODES:' $SLURM_NNODES",
+            "echo 'SLURM_NODELIST:' $SLURM_NODELIST",
+            "echo ''",
+        ])
+
+        # Check if image needs parallel pull on all nodes
+        # Pull if: image is from registry (contains / or .) and not a local ci-* build
+        docker_image = env_vars.get("DOCKER_IMAGE_NAME", "")
+        is_registry_image = docker_image and not docker_image.startswith("ci-") and ("/" in docker_image or "." in docker_image)
+
+        if is_registry_image:
+            # Add parallel docker pull on all nodes
+            # This ensures all nodes have the image before running
+            script_lines.extend([
+                "",
+                "# Pull Docker image in parallel on all nodes",
+                "echo '=========================================='",
+                "echo 'Pulling Docker image on all nodes in parallel'",
+                "echo '=========================================='",
+                f"echo 'Image: {docker_image}'",
+                "echo ''",
+                "",
+                f"srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c \"",
+                f"  echo \\\"[\\$(hostname)] Pulling {docker_image}...\\\"",
+                f"  docker pull {docker_image}",
+                "  PULL_RC=\\$?",
+                "  if [ \\$PULL_RC -eq 0 ]; then",
+                "    echo \\\"[\\$(hostname)] Pull SUCCESS\\\"",
+                "  else",
+                "    echo \\\"[\\$(hostname)] Pull FAILED with exit code \\$PULL_RC\\\"",
+                "  fi",
+                "  exit \\$PULL_RC",
+                "\"",
+                "PULL_EXIT=$?",
+                "",
+                "if [ $PULL_EXIT -ne 0 ]; then",
+                "  echo 'Docker pull failed on one or more nodes'",
+                "  exit $PULL_EXIT",
+                "fi",
+                "",
+                "echo ''",
+                "echo 'Docker image pulled on all nodes'",
+                "echo ''",
+            ])
+
+        # Create completion marker path for robust completion detection
+        # Use absolute path since script will cd to different directory
+        completion_marker = (self.output_dir / f"madengine_{model_info['name']}.complete").resolve()
+
+        script_lines.extend([
+            "",
+            "# Change to script directory",
+            f"cd {model_script_path.parent}",
+            "",
+            "# Run the model script directly on baremetal",
+            f"echo 'Executing: bash {model_script_path.name} {model_args}'",
+            f"bash {model_script_path.name} {model_args}",
+            "SCRIPT_EXIT_CODE=$?",
+            "",
+            "echo ''",
+            "echo 'Script completed.'",
+            "",
+            "# Write completion marker for madengine to detect",
+            f"echo \"exit_code=$SCRIPT_EXIT_CODE\" > {completion_marker}",
+            f"echo \"timestamp=$(date -Iseconds)\" >> {completion_marker}",
+            f"echo 'Completion marker written: {completion_marker}'",
+            "",
+            "exit $SCRIPT_EXIT_CODE",
+        ])
+
+        # Store marker path for monitor to check
+        self._completion_marker = completion_marker
+
+        script_content = "\n".join(script_lines)
+
+        # Save script
+        self.script_path = self.output_dir / f"madengine_{model_info['name']}.sh"
+        self.script_path.write_text(script_content)
+        self.script_path.chmod(0o755)
+
+        self.console.print(f"[green]✓ Generated baremetal script: {self.script_path}[/green]")
+        self.console.print(f"  Model script: {model_script_path}")
+        self.console.print(f"  Environment: {len(env_vars)} variables")
+
+        return True
+
     @staticmethod
     def _normalize_nodelist(nodelist: Optional[str]) -> Optional[str]:
         """Normalize nodelist to comma-separated without spaces for #SBATCH --nodelist."""
@@ -523,8 +851,8 @@ def _generate_launcher_command(
             return self._generate_vllm_command(nnodes, nproc_per_node, master_port)
         elif launcher_type == "sglang":
             return self._generate_sglang_command(nnodes, nproc_per_node, master_port)
-        elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg":
-            return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port)
+        elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]:
+            return self._generate_slurm_multi_command(nnodes, nproc_per_node, master_port)
         elif launcher_type == "deepspeed":
            return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port)
         elif launcher_type == "megatron":
@@ -621,13 +949,13 @@ def _generate_sglang_command(
 export SGLANG_PIPELINE_PARALLEL_SIZE={nnodes}
 
 # SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed'''
 
-    def _generate_sglang_disagg_command(
+    def _generate_slurm_multi_command(
         self, nnodes: int, nproc_per_node: int, master_port: int
     ) -> str:
         """
-        Generate SGLang Disaggregated launcher environment for SLURM.
+        Generate slurm_multi launcher environment for SLURM.
 
-        SGLang Disaggregated Architecture:
+        slurm_multi Architecture (multi-node baremetal):
         - Node 0: Proxy (load balancer)
         - Nodes 1 to xP: Prefill nodes
         - Nodes xP+1 to xP+yD: Decode nodes
@@ -643,11 +971,11 @@ def _generate_sglang_disagg_command(
             Environment setup with node role assignment
 
         Raises:
-            ValueError: If nnodes < 3 (minimum for disagg)
+            ValueError: If nnodes < 3
         """
         if nnodes < 3:
             raise ValueError(
-                f"SGLang Disaggregated requires minimum 3 nodes "
+                f"slurm_multi requires minimum 3 nodes "
                 f"(1 proxy + 1 prefill + 1 decode), got {nnodes}"
             )
 
@@ -843,7 +1171,12 @@ def _generate_basic_env_command(
 # Model script should handle launcher invocation'''
 
     def deploy(self) -> DeploymentResult:
-        """Submit sbatch script to SLURM scheduler (locally)."""
+        """
+        Deploy to SLURM - either via sbatch (new job) or bash (existing allocation).
+
+        If SLURM_JOB_ID is set (inside salloc), runs script directly with bash.
+        Otherwise, submits a new job via sbatch.
+        """
         if not self.script_path or not self.script_path.exists():
             return DeploymentResult(
                 status=DeploymentStatus.FAILED,
@@ -851,6 +1184,87 @@
                 message="Script not generated. Run prepare() first.",
             )
 
+        # ========== BRANCH: Inside allocation vs new job ==========
+        if self.inside_allocation:
+            return self._run_inside_existing_allocation()
+        else:
+            return self._submit_new_job()
+
+    def _run_inside_existing_allocation(self) -> DeploymentResult:
+        """
+        Run script directly inside existing salloc allocation using bash.
+
+        The script will use the nodes already allocated to the current job.
+        SLURM environment variables (SLURM_NODELIST, etc.) are inherited.
+        """
+        # Validate node count before running
+        is_valid, error_msg = self._validate_allocation_nodes()
+        if not is_valid:
+            return DeploymentResult(
+                status=DeploymentStatus.FAILED,
+                deployment_id=self.existing_job_id,
+                message=error_msg,
+            )
+
+        self.console.print(
+            f"\n[bold cyan]Running inside existing SLURM allocation[/bold cyan]"
+        )
+        self.console.print(f"  Job ID: {self.existing_job_id}")
+        self.console.print(f"  Using {self.nodes} of {self.allocation_nodes} allocated nodes")
+        self.console.print(f"  GPUs per node: {self.gpus_per_node}")
+        self.console.print(f"  Script: {self.script_path}")
+        self.console.print(f"\n[dim]Executing: bash {self.script_path}[/dim]\n")
+
+        try:
+            # Run script directly with bash (synchronous, blocks until done)
+            # Don't capture output - let it stream directly to console
+            result = subprocess.run(
+                ["bash", str(self.script_path)],
+                timeout=self.config.timeout if self.config.timeout > 0 else None,
+            )
+
+            if result.returncode == 0:
+                self.console.print(
+                    f"\n[green]✓ Script completed successfully in allocation {self.existing_job_id}[/green]"
+                )
+                return DeploymentResult(
+                    status=DeploymentStatus.SUCCESS,
+                    deployment_id=self.existing_job_id,
+                    message=f"Completed inside existing allocation {self.existing_job_id}",
+                    logs_path=str(self.output_dir),
+                    skip_monitoring=True,  # Already ran synchronously, no need to poll
+                )
+            else:
+                self.console.print(
+                    f"\n[red]✗ Script failed with exit code {result.returncode}[/red]"
+                )
+                return DeploymentResult(
+                    status=DeploymentStatus.FAILED,
+                    deployment_id=self.existing_job_id,
+                    message=f"Script failed with exit code {result.returncode}",
+                    logs_path=str(self.output_dir),
+                    skip_monitoring=True,  # Already ran synchronously
+                )
+
+        except subprocess.TimeoutExpired:
+            self.console.print(
+                f"\n[red]✗ Script timed out after {self.config.timeout}s[/red]"
+            )
+            return DeploymentResult(
+                status=DeploymentStatus.FAILED,
+                deployment_id=self.existing_job_id,
+                message=f"Script timed out after {self.config.timeout}s",
+            )
+        except Exception as e:
+            self.console.print(f"\n[red]✗ Execution error: {e}[/red]")
+            return DeploymentResult(
+                status=DeploymentStatus.FAILED,
+                deployment_id=self.existing_job_id,
+                message=f"Execution error: {str(e)}",
+            )
+
+    def _submit_new_job(self) -> DeploymentResult:
+        """Submit new SLURM job via sbatch (original behavior)."""
         # ==================== PREFLIGHT NODE SELECTION ====================
         # For multi-node jobs with Ray/vLLM, check for clean nodes first
         # to avoid OOM errors from stale processes
@@ -863,6 +1277,7 @@ def deploy(self) -> DeploymentResult:
             console=self.console,
             auto_cleanup=auto_cleanup,
             verbose=self.slurm_config.get("verbose_node_check", False),
+            reservation=self.reservation,
         )
 
         # Select clean nodes and get updated exclude list
@@ -935,6 +1350,45 @@ def deploy(self) -> DeploymentResult:
 
     def monitor(self, deployment_id: str) -> DeploymentResult:
         """Check SLURM job status (locally)."""
+        # If we ran inside an existing allocation, script already completed synchronously
+        # No need to poll - just return success (deploy() already handled the result)
+        if self.inside_allocation:
+            return DeploymentResult(
+                status=DeploymentStatus.SUCCESS,
+                deployment_id=deployment_id,
+                message=f"Completed (ran inside existing allocation {deployment_id})",
+            )
+
+        # Check for completion marker (robust detection for interactive/salloc jobs)
+        if hasattr(self, '_completion_marker') and self._completion_marker:
+            marker_path = Path(self._completion_marker)
+            if marker_path.exists():
+                # Read exit code from marker
+                try:
+                    content = marker_path.read_text()
+                    exit_code = 0
+                    for line in content.splitlines():
+                        if line.startswith("exit_code="):
+                            exit_code = int(line.split("=")[1])
+                            break
+
+                    self.console.print(f"[green]✓ Completion marker found: {marker_path}[/green]")
+
+                    if exit_code == 0:
+                        return DeploymentResult(
+                            status=DeploymentStatus.SUCCESS,
+                            deployment_id=deployment_id,
+                            message=f"Script completed successfully (exit code {exit_code})",
+                        )
+                    else:
+                        return DeploymentResult(
+                            status=DeploymentStatus.FAILED,
+                            deployment_id=deployment_id,
+                            message=f"Script failed with exit code {exit_code}",
+                        )
+                except Exception as e:
+                    self.console.print(f"[yellow]Warning: Could not read completion marker: {e}[/yellow]")
+
         try:
             # Query job status using squeue (runs locally)
             result = subprocess.run(
@@ -1188,11 +1642,16 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
 
         # Strategy 2: Check shared workspace (NFS) for perf.csv
         # When using shared storage, perf.csv is written directly to workspace
+        # Retry briefly to allow NFS propagation after job completion
         if not results["perf_files"]:
             workspace_perf = Path("perf.csv")
-            if workspace_perf.exists():
-                results["perf_files"] = [str(workspace_perf)]
-                self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]")
+            import time
+            for _attempt in range(6):
+                if workspace_perf.exists():
+                    results["perf_files"] = [str(workspace_perf)]
+                    self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]")
+                    break
+                time.sleep(5)
 
         # Parse perf.csv to populate successful_runs and failed_runs
         # Filter based on session_start_row passed as parameter (no external files!)
@@ -1250,6 +1709,14 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
 
     def cleanup(self, deployment_id: str) -> bool:
         """Cancel SLURM job if still running (locally)."""
+        # CRITICAL: Never cancel an existing allocation we're running inside!
+        # The user's salloc session should not be terminated by madengine
+        if self.inside_allocation:
+            self.console.print(
+                f"[dim]Skipping cleanup - running inside existing allocation {deployment_id}[/dim]"
+            )
+            return True
+
         try:
             subprocess.run(
                 ["scancel", deployment_id], capture_output=True, timeout=10
diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py
index b52f53d7..a494ff76 100644
--- a/src/madengine/deployment/slurm_node_selector.py
+++ b/src/madengine/deployment/slurm_node_selector.py
@@ -71,6 +71,7 @@ def __init__(
         auto_cleanup: bool = False,
         verbose: bool = False,
         timeout: int = 30,
+        reservation: Optional[str] = None,
     ):
         """
         Initialize node selector.
@@ -80,11 +81,13 @@ def __init__(
             auto_cleanup: Automatically clean dirty nodes
             verbose: Enable verbose logging
             timeout: Timeout for srun commands (seconds)
+            reservation: SLURM reservation name for reserved nodes
         """
         self.console = console or Console()
         self.auto_cleanup = auto_cleanup
         self.verbose = verbose
         self.timeout = timeout
+        self.reservation = reservation
 
     def get_candidate_nodes(
         self,
@@ -199,16 +202,20 @@ def check_node_health(self, node: str) -> NodeStatus:
 
         try:
             # Use srun to execute check on specific node
+            srun_cmd = [
+                "srun",
+                f"--nodelist={node}",
+                "--ntasks=1",
+                "--time=00:01:00",
+                "--overlap",  # Allow overlap with running jobs
+                "--quiet",
+            ]
+            if self.reservation:
+                srun_cmd.append(f"--reservation={self.reservation}")
+            srun_cmd.extend(["bash", "-c", check_script])
+
             result = subprocess.run(
-                [
-                    "srun",
-                    f"--nodelist={node}",
-                    "--ntasks=1",
-                    "--time=00:01:00",
-                    "--overlap",  # Allow overlap with running jobs
-                    "--quiet",
-                    "bash", "-c", check_script
-                ],
+                srun_cmd,
                 capture_output=True,
                 text=True,
                 timeout=self.timeout,
@@ -309,16 +316,20 @@ def cleanup_node(self, node: str) -> bool:
         """
         try:
+            srun_cmd = [
+                "srun",
+                f"--nodelist={node}",
+                "--ntasks=1",
+                "--time=00:01:00",
+                "--overlap",
+                "--quiet",
+            ]
+            if self.reservation:
+                srun_cmd.append(f"--reservation={self.reservation}")
+            srun_cmd.extend(["bash", "-c", cleanup_script])
+
             result = subprocess.run(
-                [
-                    "srun",
-                    f"--nodelist={node}",
-                    "--ntasks=1",
-                    "--time=00:01:00",
-                    "--overlap",
-                    "--quiet",
-                    "bash", "-c", cleanup_script
-                ],
+                srun_cmd,
                 capture_output=True,
                 text=True,
                 timeout=self.timeout,
diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py
index ba011e81..46bc8b42 100644
--- a/src/madengine/execution/container_runner.py
+++ b/src/madengine/execution/container_runner.py
@@ -13,6 +13,15 @@
 import typing
 import warnings
 import re
+import subprocess
+
+# Launchers that should run directly on baremetal (not inside Docker)
+# These launchers manage their own Docker containers via SLURM srun commands
+SLURM_MULTI_ALIASES = [
+    "slurm_multi",
+    "slurm-multi",
+]
+BAREMETAL_LAUNCHERS = SLURM_MULTI_ALIASES
 from rich.console import Console as RichConsole
 from contextlib import redirect_stdout, redirect_stderr
 from madengine.core.console import Console
@@ -580,6 +589,151 @@ def apply_tools(
             else:
                 print(f"  Note: Command '{cmd}' already added by another tool, skipping duplicate.")
 
+    def _run_on_baremetal(
+        self,
+        model_info: typing.Dict,
+        build_info: typing.Dict,
+        log_file_path: str,
+        timeout: int,
+        run_results: typing.Dict,
+        pre_encapsulate_post_scripts: typing.Dict,
+        run_env: typing.Dict,
+    ) -> typing.Dict:
+        """
+        Run script directly on baremetal (not inside Docker).
+
+        Used for slurm_multi launchers that manage their own Docker containers
+        via SLURM srun commands. The script is executed directly on the node.
+
+        Args:
+            model_info: Model configuration from manifest
+            build_info: Build information from manifest
+            log_file_path: Path to log file
+            timeout: Execution timeout in seconds
+            run_results: Dictionary to store run results
+            pre_encapsulate_post_scripts: Pre/post script configuration
+            run_env: Environment variables for the script
+
+        Returns:
+            Dictionary with run results
+        """
+        import shutil
+
+        self.rich_console.print(f"[dim]{'='*80}[/dim]")
+
+        # Prepare script path
+        scripts_arg = model_info["scripts"]
+
+        # Get the current working directory (might be temp workspace)
+        cwd = os.getcwd()
+        print(f"📂 Current directory: {cwd}")
+
+        if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"):
+            script_path = scripts_arg
+            script_name = os.path.basename(scripts_arg)
+        elif scripts_arg.endswith(".py"):
+            script_path = scripts_arg
+            script_name = os.path.basename(scripts_arg)
+        else:
+            # Directory specified - look for run.sh
+            script_path = os.path.join(scripts_arg, "run.sh")
+            script_name = "run.sh"
+
+        # If script path is relative, make it absolute from cwd
+        if not os.path.isabs(script_path):
+            script_path = os.path.join(cwd, script_path)
+
+        # Check script exists
+        if not os.path.exists(script_path):
+            print(f"⚠️ Script not found at: {script_path}")
+            # Try alternative locations
+            alt_path = os.path.join(cwd, os.path.basename(scripts_arg))
+            if os.path.exists(alt_path):
+                script_path = alt_path
+                print(f"✓ Found at alternative location: {script_path}")
+            else:
+                raise FileNotFoundError(f"Script not found: {script_path}")
+
+        script_dir = os.path.dirname(script_path) or cwd
+        print(f"📜 Script: {script_path}")
+        print(f"📁 Working directory: {script_dir}")
+
+        # Prepare model arguments
+        model_args = self.context.ctx.get("model_args", model_info.get("args", ""))
+        print(f"📝 Arguments: {model_args}")
+
+        # Build command
+        if script_path.endswith(".py"):
+            cmd = f"python3 {script_path} {model_args}"
+        else:
+            cmd = f"bash {script_path} {model_args}"
+
+        print(f"🔧 Command: {cmd}")
+
+        # Prepare environment
+        env = os.environ.copy()
+        env.update(run_env)
+
+        # Add model-specific env vars from model_info
+        if "env_vars" in model_info and model_info["env_vars"]:
+            for key, value in model_info["env_vars"].items():
+                env[key] = str(value)
+                print(f"  ENV: {key}={value}")
+
+        # Add env vars from additional_context
+        if self.additional_context and "env_vars" in self.additional_context:
+            for key, value in self.additional_context["env_vars"].items():
+                env[key] = str(value)
+
+        # Run script with logging
+        test_start_time = time.time()
+        self.rich_console.print("\n[bold blue]Running script on baremetal...[/bold blue]")
+
+        try:
+            with open(log_file_path, mode="w", buffering=1) as outlog:
+                with redirect_stdout(
+                    PythonicTee(outlog, self.live_output)
+                ), redirect_stderr(PythonicTee(outlog, self.live_output)):
+                    print(f"⏰ Setting timeout to {timeout} seconds.")
+                    print(f"🚀 Executing: {cmd}")
+                    print(f"📂 Working directory: {script_dir}")
+                    print(f"{'='*80}")
+
+                    result = subprocess.run(
+                        cmd,
+                        shell=True,
+                        cwd=script_dir,
+                        env=env,
+                        timeout=timeout if timeout > 0 else None,
+                    )
+
+                    run_results["test_duration"] = time.time() - test_start_time
+                    print(f"\n{'='*80}")
+                    print(f"⏱️ Test Duration: {run_results['test_duration']:.2f} seconds")
+
+                    if result.returncode == 0:
+                        run_results["status"] = "SUCCESS"
+                        self.rich_console.print("[bold green]✓ Script completed successfully[/bold green]")
+                    else:
+                        run_results["status"] = "FAILURE"
+                        run_results["status_detail"] = f"Exit code {result.returncode}"
+                        self.rich_console.print(f"[bold red]✗ Script failed with exit code {result.returncode}[/bold red]")
+                        raise subprocess.CalledProcessError(result.returncode, cmd)
+
+        except subprocess.TimeoutExpired:
+            run_results["status"] = "FAILURE"
+            run_results["status_detail"] = f"Timeout after {timeout}s"
+            run_results["test_duration"] = time.time() - test_start_time
+            self.rich_console.print(f"[bold red]✗ Script timed out after {timeout}s[/bold red]")
+            raise
+        except Exception as e:
+            run_results["status"] = "FAILURE"
+            run_results["status_detail"] = str(e)
+            run_results["test_duration"] = time.time() - test_start_time
+            raise
+
+        return run_results
+
     def run_pre_post_script(
         self, model_docker: Docker, model_dir: str, pre_post: typing.List
     ) -> None:
@@ -813,6 +967,15 @@ def run_container(
         if merged_count > 0:
             print(f"ℹ️ Merged {merged_count} environment variables from additional_context")
 
+        # Merge env_vars from model_info (models.json) into docker_env_vars
+        if "env_vars" in model_info and model_info["env_vars"]:
+            model_env_count = 0
+            for key, value in model_info["env_vars"].items():
+                self.context.ctx["docker_env_vars"][key] = str(value)
+                model_env_count += 1
+            if model_env_count > 0:
+                print(f"ℹ️ Merged {model_env_count} environment variables from model_info (models.json)")
+
         if "data" in model_info and model_info["data"] != "" and self.data:
             mount_datapaths = self.data.get_mountpaths(model_info["data"])
             model_dataenv = self.data.get_env(model_info["data"])
@@ -874,6 +1037,44 @@ def run_container(
 
         print(f"Docker options: {docker_options}")
 
+        # ========== CHECK FOR BAREMETAL LAUNCHERS ==========
+        # slurm_multi launchers run scripts directly on baremetal,
+        # not inside Docker. The script itself manages Docker containers via srun.
+        launcher = ""
+
+        # Debug: Print all sources
+        print("🔍 Baremetal check - looking for launcher...")
+        print(f"   MAD_LAUNCHER_TYPE env: {os.environ.get('MAD_LAUNCHER_TYPE', '')}")
+        if self.additional_context:
+            distributed_config = self.additional_context.get("distributed", {})
+            launcher = distributed_config.get("launcher", "")
+            print(f"   additional_context.distributed.launcher: {launcher or ''}")
+        if not launcher and model_info.get("distributed"):
+            launcher = model_info["distributed"].get("launcher", "")
+            print(f"   model_info.distributed.launcher: {launcher or ''}")
+        if not launcher:
+            launcher = os.environ.get("MAD_LAUNCHER_TYPE", "")
+            print(f"   Fallback to MAD_LAUNCHER_TYPE: {launcher or ''}")
+
+        print(f"   Final launcher detected: {launcher or ''}")
+
+        # Normalize launcher name (replace underscores with hyphens)
+        launcher_normalized = launcher.lower().replace("_", "-") if launcher else ""
+
+        if launcher_normalized and launcher_normalized in [l.lower().replace("_", "-") for l in BAREMETAL_LAUNCHERS]:
+            self.rich_console.print(f"\n[bold cyan]🖥️ Running on BAREMETAL (launcher: {launcher})[/bold cyan]")
+            self.rich_console.print("[dim]Script will manage its own Docker containers via SLURM[/dim]")
+            return self._run_on_baremetal(
+                model_info=model_info,
+                build_info=build_info,
+                log_file_path=log_file_path,
+                timeout=timeout,
+                run_results=run_results,
+                pre_encapsulate_post_scripts=pre_encapsulate_post_scripts,
+                run_env=run_env,
+            )
+        # ========== END BAREMETAL CHECK ==========
+
         self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]")
         print(f"🏷️ Image: {docker_image}")
         print(f"📦 Container: {container_name}")
@@ -992,8 +1193,8 @@ def run_container(
         # Prepare script execution
         scripts_arg = model_info["scripts"]
 
-        if scripts_arg.endswith(".sh"):
-            # Shell script specified directly
+        if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"):
+            # Shell script specified directly (.sh or .slurm for SLURM batch scripts)
             dir_path = os.path.dirname(scripts_arg)
             script_name = "bash " + os.path.basename(scripts_arg)
         elif scripts_arg.endswith(".py"):
diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py
index 3901d864..fe2d5ae5 100644
--- a/src/madengine/execution/docker_builder.py
+++ b/src/madengine/execution/docker_builder.py
@@ -416,6 +416,17 @@ def export_build_manifest(
                 "registry"
             )
 
+        # Update built_models with registry image name for parallel pull in slurm_multi
+        # Map local image to registry image for env_vars
+        for image_name, build_info in self.built_images.items():
+            registry_image = build_info.get("registry_image")
+            if registry_image and image_name in self.built_models:
+                model_data = self.built_models[image_name]
+                if "env_vars" not in model_data:
+                    model_data["env_vars"] = {}
+                # Set DOCKER_IMAGE_NAME to registry image for parallel pull
+                model_data["env_vars"]["DOCKER_IMAGE_NAME"] = registry_image
+
         manifest = {
             "built_images": self.built_images,
             "built_models": self.built_models,
diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py
index 49ee76c2..f3a04f7a 100644
--- a/src/madengine/orchestration/build_orchestrator.py
+++ b/src/madengine/orchestration/build_orchestrator.py
@@ -84,6 +84,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None):
             merged_context.update(additional_context)
 
         self.additional_context = merged_context
+        self._original_user_slurm_keys = set(merged_context.get("slurm", {}).keys())
 
         # Apply ConfigLoader to infer deploy type, validate, and apply defaults
         if self.additional_context:
@@ -178,6 +179,8 @@ def execute(
         clean_cache: bool = False,
         manifest_output: str = "build_manifest.json",
         batch_build_metadata: Optional[Dict] = None,
+        use_image: Optional[str] = None,
+        build_on_compute: bool = False,
     ) -> str:
         """
         Execute build workflow.
@@ -187,6 +190,8 @@ def execute(
             clean_cache: Whether to use --no-cache for Docker builds
             manifest_output: Output file for build manifest
             batch_build_metadata: Optional batch build metadata
+            use_image: Pre-built Docker image to use (skip Docker build)
+            build_on_compute: Build on SLURM compute node instead of login node
 
         Returns:
             Path to generated build_manifest.json
@@ -195,6 +200,52 @@ def execute(
             DiscoveryError: If model discovery fails
             BuildError: If Docker build fails
         """
+        # Handle pre-built image mode
+        if use_image:
+            # If use_image is "auto", resolve from model card
+            if use_image == "auto":
+                use_image = self._resolve_image_from_model_card()
+
+            return self._execute_with_prebuilt_image(
+                use_image=use_image,
+                manifest_output=manifest_output,
+            )
+
+        # Handle build-on-compute mode
+        if build_on_compute:
+            return self._execute_build_on_compute(
+                registry=registry,
+                clean_cache=clean_cache,
+                manifest_output=manifest_output,
+                batch_build_metadata=batch_build_metadata,
+            )
+
+        # For normal build: check if slurm_multi launcher requires registry
+        # Discover models first to check launcher
+        discover_models = DiscoverModels(args=self.args)
+        discovered_models = discover_models.run()
+
+        if discovered_models:
+            for model in discovered_models:
+                launcher = model.get("distributed", {}).get("launcher", "")
+                if launcher in ["slurm_multi", "slurm-multi"] and not registry:
+                    model_name = model.get("name", "unknown")
+                    raise ConfigurationError(
+                        f"slurm_multi launcher requires --registry or --use-image",
+                        context=create_error_context(
+                            operation="build",
+                            component="BuildOrchestrator",
+                            model=model_name,
+                            launcher=launcher,
+                        ),
+                        suggestions=[
+                            "Use --registry docker.io/myorg to push image (nodes will pull in parallel)",
+                            "Use --use-image to use a pre-built image from registry",
+                            "Use --build-on-compute --registry to build on compute and push",
+                            "For subsequent runs with same image, use: --use-image <image>",
+                        ],
+                    )
+
         self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]")
         self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]")
         self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]")
@@ -418,3 +469,678 @@ def _save_deployment_config(self, manifest_file: str):
             # Non-fatal - just warn
             self.rich_console.print(f"[yellow]Warning: Could not save deployment config: {e}[/yellow]")
+
+    def _execute_with_prebuilt_image(
+        self,
+        use_image: str,
+        manifest_output: str = "build_manifest.json",
+    ) -> str:
+        """
+        Generate manifest for a pre-built Docker image (skip Docker build).
+
+        This is useful when using external images like:
+        - lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x
+        - nvcr.io/nvidia/pytorch:24.01-py3
+
+        Args:
+            use_image: Pre-built Docker image name
+            manifest_output: Output file for build manifest
+
+        Returns:
+            Path to generated build_manifest.json
+        """
+        self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]")
+        self.rich_console.print("[bold blue]🔨 BUILD PHASE (Pre-built Image Mode)[/bold blue]")
+        self.rich_console.print(f"[cyan]Using pre-built image: {use_image}[/cyan]")
+        self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n")
+
+        try:
+            # Step 1: Discover models
+            self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]")
+            discover_models = DiscoverModels(args=self.args)
+            models = discover_models.run()
+
+            if not models:
+                raise DiscoveryError(
+                    "No models discovered",
+                    context=create_error_context(
+                        operation="discover_models",
+                        component="BuildOrchestrator",
+                    ),
+                    suggestions=[
+                        "Check if models.json exists",
+                        "Verify --tags parameter is correct",
+                    ],
+                )
+
+            self.rich_console.print(f"[green]✓ Found {len(models)} models[/green]\n")
+
+            # Step 2: Generate manifest with pre-built image
+            self.rich_console.print("[bold cyan]📄 Generating manifest for pre-built image...[/bold cyan]")
+
+            manifest = {
+                "built_images": {
+                    use_image: {
+                        "image_name": use_image,
+                        "docker_image": use_image,
+                        "dockerfile": "",
+                        "build_time": 0,
+                        "prebuilt": True,
+                    }
+                },
+                "built_models": {},
+                "context": self.context.ctx if hasattr(self.context, 'ctx') else {},
+                "credentials_required": [],
+                "summary": {
+                    "successful_builds": [],
+                    "failed_builds": [],
+                    "total_build_time": 0,
+                    "successful_pushes": [],
+                    "failed_pushes": [],
+                },
+            }
+
+            # Add each discovered model with the pre-built image
+            # Use the image name as the key (matches how madengine build does it)
+            for model in models:
+                model_name = model.get("name", "unknown")
+                model_distributed = model.get("distributed", {})
+
+                # Merge DOCKER_IMAGE_NAME into env_vars for parallel pull in run phase
+                model_env_vars = model.get("env_vars", {}).copy()
+                model_env_vars["DOCKER_IMAGE_NAME"] = use_image
+
+                # Use image name as key so slurm.py can find docker_image
+                manifest["built_models"][use_image] = {
+                    "name": model_name,
+                    "image": use_image,
+                    "docker_image": use_image,
+                    "dockerfile": model.get("dockerfile", ""),
+                    "scripts": model.get("scripts", ""),
+                    "data": model.get("data", ""),
+                    "n_gpus": model.get("n_gpus", "8"),
+                    "owner": model.get("owner", ""),
+                    "training_precision": model.get("training_precision", ""),
+                    "multiple_results": model.get("multiple_results", ""),
+                    "tags": model.get("tags", []),
+                    "timeout": model.get("timeout", -1),
+                    "args": model.get("args", ""),
+                    "slurm": model.get("slurm", {}),
+                    "distributed": model_distributed,
+                    "env_vars": model_env_vars,
+                    "prebuilt": True,
+                }
+                manifest["summary"]["successful_builds"].append(model_name)
+
+            # Save manifest
+            with open(manifest_output, "w") as f:
+                json.dump(manifest, f, indent=2)
+
+            # Save deployment config
+            self._save_deployment_config(manifest_output)
+
+            # Merge model's distributed and slurm config into deployment_config
+            # This ensures launcher and slurm settings are in deployment_config even if not in additional-context
+            if models:
+                with open(manifest_output, "r") as f:
+                    saved_manifest = json.load(f)
+
+                if "deployment_config" not in saved_manifest:
+                    saved_manifest["deployment_config"] = {}
+
+                # Merge model's distributed config
+                model_distributed = models[0].get("distributed", {})
+                if model_distributed:
+                    if "distributed" not in saved_manifest["deployment_config"]:
+                        saved_manifest["deployment_config"]["distributed"] = {}
+
+                    # Copy launcher and other critical fields from model config
+                    for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg", "vllm_disagg"]:
+                        if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]:
+                            saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key]
+
+                # Merge model's slurm config into deployment_config.slurm
+                # This enables run phase to auto-detect SLURM deployment without --additional-context
+                model_slurm = models[0].get("slurm", {})
+                if model_slurm:
+                    if "slurm" not in saved_manifest["deployment_config"]:
+                        saved_manifest["deployment_config"]["slurm"] = {}
+
+                    # Copy slurm settings from model config (model card fills in
+                    # values not explicitly set by --additional-context).
+                    # Use _original_user_slurm_keys (captured before ConfigLoader
+                    # applies defaults) so model card values override defaults
+                    # but user's explicit CLI values still win.
+                    for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir", "nodelist"]:
+                        if key in model_slurm and key not in self._original_user_slurm_keys:
+                            saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key]
+
+                with open(manifest_output, "w") as f:
+                    json.dump(saved_manifest, f, indent=2)
+
+            self.rich_console.print(f"[green]✓ Generated manifest: {manifest_output}[/green]")
+            self.rich_console.print(f"   Pre-built image: {use_image}")
+            self.rich_console.print(f"   Models: {len(models)}")
+            self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n")
+
+            return manifest_output
+
+        except (DiscoveryError, BuildError):
+            raise
+        except Exception as e:
+            raise BuildError(
+                f"Failed to generate manifest for pre-built image: {e}",
+                context=create_error_context(
+                    operation="prebuilt_manifest",
+                    component="BuildOrchestrator",
+                ),
+            ) from e
+
+    def _resolve_image_from_model_card(self) -> str:
+        """
+        Resolve Docker image name from model card's DOCKER_IMAGE_NAME env var.
+
+        This method discovers models and extracts the DOCKER_IMAGE_NAME from
+        env_vars. If multiple models have different images, uses the first
+        and prints a warning.
+
+        Returns:
+            Docker image name from model card
+
+        Raises:
+            ConfigurationError: If no DOCKER_IMAGE_NAME found in any model
+        """
+        self.rich_console.print("[bold cyan]🔍 Auto-detecting image from model card...[/bold cyan]")
+
+        # Discover models to get their env_vars
+        discover_models = DiscoverModels(args=self.args)
+        models = discover_models.run()
+
+        if not models:
+            raise ConfigurationError(
+                "No models discovered for image auto-detection",
+                context=create_error_context(
+                    operation="resolve_image",
+                    component="BuildOrchestrator",
+                ),
+                suggestions=[
+                    "Specify image name explicitly with --use-image <image>",
+                    "Check if models.json exists",
+                    "Verify --tags parameter is correct",
+                ],
+            )
+
+        # Collect DOCKER_IMAGE_NAME from all models
+        images_found = {}
+        for model in models:
+            model_name = model.get("name", "unknown")
+            env_vars = model.get("env_vars", {})
+            docker_image = env_vars.get("DOCKER_IMAGE_NAME")
+
+            if docker_image:
+                images_found[model_name] = docker_image
+
+        if not images_found:
+            model_names = [m.get("name", "unknown") for m in models]
+            raise ConfigurationError(
+                "No DOCKER_IMAGE_NAME found in model card env_vars",
+                context=create_error_context(
+                    operation="resolve_image",
+                    component="BuildOrchestrator",
+                    model_names=model_names,
+                ),
+                suggestions=[
+                    "Add DOCKER_IMAGE_NAME to model's env_vars in models.json",
+                    "Specify image name explicitly with --use-image <image>",
+                    'Example: "env_vars": {"DOCKER_IMAGE_NAME": "myimage:tag"}',
+                ],
+            )
+
+        # Use first model's image
+        first_model = list(images_found.keys())[0]
+        resolved_image = images_found[first_model]
+
+        # Warn if multiple models have different images
+        unique_images = set(images_found.values())
+        if len(unique_images) > 1:
+            self.rich_console.print(
+                f"[yellow]⚠️  Warning: Multiple models have different DOCKER_IMAGE_NAME values:[/yellow]"
+            )
+            for model_name, image in images_found.items():
+                self.rich_console.print(f"   - {model_name}: {image}")
+            self.rich_console.print(
+                f"[yellow]   Using image from '{first_model}': {resolved_image}[/yellow]\n"
+            )
+        else:
+            self.rich_console.print(f"[green]✓ Auto-detected image: {resolved_image}[/green]\n")
+
+        return resolved_image
+
+    def _execute_build_on_compute(
+        self,
+        registry: Optional[str] = None,
+        clean_cache: bool = False,
+        manifest_output: str = "build_manifest.json",
+        batch_build_metadata: Optional[Dict] = None,
+    ) -> str:
+        """
+        Execute Docker build on a SLURM compute node and push to registry.
+
+        Build workflow:
+        1. Build on 1 compute node only
+        2. Push image to registry
+        3. Store registry image name in manifest
+        4. Run phase will pull image in parallel on all nodes
+
+        Args:
+            registry: Registry to push images to (REQUIRED)
+            clean_cache: Whether to use --no-cache for Docker builds
+            manifest_output: Output file for build manifest
+            batch_build_metadata: Optional batch build metadata
+
+        Returns:
+            Path to generated build_manifest.json
+        """
+        import subprocess
+        import os
+        import glob
+
+        self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]")
+        self.rich_console.print("[bold blue]🔨 BUILD PHASE (Compute Node Mode)[/bold blue]")
+        self.rich_console.print("[cyan]Building on 1 compute node, pushing to registry...[/cyan]")
+        self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n")
+
+        # Discover models first to get SLURM config from model card
+        self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]")
+        discover_models = DiscoverModels(args=self.args)
+        models = discover_models.run()
+
+        if not models:
+            raise DiscoveryError(
+                "No models discovered for build-on-compute",
+                context=create_error_context(
+                    operation="build_on_compute",
+                    component="BuildOrchestrator",
+                ),
+                suggestions=[
+                    "Check if models.json exists",
+                    "Verify --tags parameter is correct",
+                ],
+            )
+
+        model = models[0]
+        model_name = model.get("name", "unknown")
+        self.rich_console.print(f"[green]✓ Found model: {model_name}[/green]\n")
+
+        # Merge SLURM config: model card (base) + additional-context (override)
+        model_slurm_config = model.get("slurm", {})
+        context_slurm_config = self.additional_context.get("slurm", {})
+
+        # Start with model card config, then override with command-line context
+        slurm_config = {**model_slurm_config, **context_slurm_config}
+
+        self.rich_console.print("[bold cyan]📋 SLURM Configuration (merged):[/bold cyan]")
+        if model_slurm_config:
+            self.rich_console.print(f"   [dim]From model card:[/dim] {list(model_slurm_config.keys())}")
+        if context_slurm_config:
+            self.rich_console.print(f"   [dim]From --additional-context (overrides):[/dim] {list(context_slurm_config.keys())}")
+
+        # Validate required fields
+        partition = slurm_config.get("partition")
+        if not partition:
+            raise ConfigurationError(
+                "Missing required SLURM field: partition",
+                context=create_error_context(
+                    operation="build_on_compute",
+                    component="BuildOrchestrator",
+                ),
+                suggestions=[
+                    'Add "partition" to model card\'s slurm section',
+                    'Or specify via --additional-context \'{"slurm": {"partition": "gpu"}}\'',
+                ],
+            )
+
+        reservation = slurm_config.get("reservation", "")
+        time_limit = slurm_config.get("time", "02:00:00")
+
+        self.rich_console.print(f"   Partition: {partition}")
+        self.rich_console.print(f"   Time limit: {time_limit}")
+        if reservation:
+            self.rich_console.print(f"   Reservation: {reservation}")
+        self.rich_console.print("")
+
+        # Validate registry credentials
+        self.rich_console.print("[bold cyan]🔐 Registry Configuration:[/bold cyan]")
+        self.rich_console.print(f"   Registry: {registry}")
+
+        # Check for credentials - either from environment or credential.json
+        dockerhub_user = os.environ.get("MAD_DOCKERHUB_USER", "")
+        dockerhub_password = os.environ.get("MAD_DOCKERHUB_PASSWORD", "")
+
+        # Try to load from credential.json if env vars not set
+        credential_file = Path("credential.json")
+        if not dockerhub_user and credential_file.exists():
+            try:
+                with open(credential_file) as f:
+                    creds = json.load(f)
+                    dockerhub_creds = creds.get("dockerhub", {})
+                    dockerhub_user = dockerhub_creds.get("username", "")
+                    dockerhub_password = dockerhub_creds.get("password", "")
+                if dockerhub_user:
+                    self.rich_console.print(f"   Credentials: Found in credential.json")
+            except (json.JSONDecodeError, IOError) as e:
+                self.rich_console.print(f"   [yellow]Warning: Could not read credential.json: {e}[/yellow]")
+        elif dockerhub_user:
+            self.rich_console.print(f"   Credentials: Found in environment (MAD_DOCKERHUB_USER)")
+
+        # Determine if registry requires authentication
+        requires_auth = True
+        public_registries = ["docker.io", "ghcr.io", "gcr.io", "quay.io", "nvcr.io"]
+        registry_lower = registry.lower() if registry else ""
+
+        # For docker.io pushes, authentication is always required
+        if any(pub_reg in registry_lower for pub_reg in public_registries):
+            if not dockerhub_user or not dockerhub_password:
+                raise ConfigurationError(
+                    f"Registry credentials required for pushing to {registry}",
+                    context=create_error_context(
+                        operation="build_on_compute",
+                        component="BuildOrchestrator",
+                        registry=registry,
+                    ),
+                    suggestions=[
+                        "Set environment variables: MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD",
+                        'Or create credential.json: {"dockerhub": {"username": "...", "password": "..."}}',
+                        "For Docker Hub, use a Personal Access Token (PAT) as password",
+                        f"Example: export MAD_DOCKERHUB_USER=myuser",
+                        f"Example: export MAD_DOCKERHUB_PASSWORD=dckr_pat_xxxxx",
+                    ],
+                )
+            self.rich_console.print(f"   Auth: Will login to registry before push")
+        else:
+            # Private/internal registry - may not need auth
+            self.rich_console.print(f"   Auth: Private registry (auth may not be required)")
+            requires_auth = dockerhub_user and dockerhub_password
+
+        self.rich_console.print("")
+
+        # Check if we're inside an existing allocation
+        inside_allocation = os.environ.get("SLURM_JOB_ID") is not None
+        existing_job_id = os.environ.get("SLURM_JOB_ID", "")
+
+        # Find Dockerfile
+        dockerfile = model.get("dockerfile", "")
+        dockerfile_path = ""
+        dockerfile_patterns = [
+            f"{dockerfile}.ubuntu.amd.Dockerfile",
+            f"{dockerfile}.Dockerfile",
+            f"{dockerfile}",
+        ]
+        for pattern in dockerfile_patterns:
+            matches = glob.glob(pattern)
+            if matches:
+                dockerfile_path = matches[0]
+                break
+
+        if not dockerfile_path:
+            raise ConfigurationError(
+                f"Dockerfile not found for model {model_name}",
+                context=create_error_context(
+                    operation="build_on_compute",
+                    component="BuildOrchestrator",
+                    dockerfile=dockerfile,
+                ),
+                suggestions=[
+                    f"Check if {dockerfile}.ubuntu.amd.Dockerfile exists",
+                    "Verify the dockerfile path in models.json",
+                ],
+            )
+
+        # Generate image name for registry
+        dockerfile_basename = Path(dockerfile_path).name.replace(".Dockerfile", "").replace(".ubuntu.amd", "")
+        local_image_name = f"ci-{model_name}_{dockerfile_basename}"
+
+        # Determine registry image name based on registry format
+        # docker.io/namespace/repo -> use model name as tag: docker.io/namespace/repo:model_name
+        # docker.io/namespace -> use model name as repo: docker.io/namespace/model_name:latest
+        registry_parts = registry.replace("docker.io/", "").split("/")
+        if len(registry_parts) >= 2:
+            # Registry already includes repo name (e.g., rocm/pytorch-private)
+            # Use model name as tag
+            registry_image_name = f"{registry}:{model_name}"
+            self.rich_console.print(f"   [dim]Registry format: namespace/repo -> using model name as tag[/dim]")
+        else:
+            # Registry is just namespace (e.g., myuser)
+            # Use model name as repo
+            registry_image_name = f"{registry}/{model_name}:latest"
+            self.rich_console.print(f"   [dim]Registry format: namespace -> using model name as repo[/dim]")
+
+        self.rich_console.print("[bold cyan]🐳 Docker Configuration:[/bold cyan]")
+        self.rich_console.print(f"   Dockerfile: {dockerfile_path}")
+        self.rich_console.print(f"   Local image: {local_image_name}")
+        self.rich_console.print(f"   Registry image: {registry_image_name}")
+        self.rich_console.print("")
+
+        # Determine registry host for docker login
+        registry_host = registry.split("/")[0] if "/" in registry else registry
+
+        # Build script content - builds on 1 node, pushes to registry
+        build_script_content = f"""#!/bin/bash
+#SBATCH --job-name=madengine-build
+#SBATCH --partition={partition}
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --time={time_limit}
+{f'#SBATCH --reservation={reservation}' if reservation else ''}
+#SBATCH --output=madengine_build_%j.out
+#SBATCH --error=madengine_build_%j.err
+
+echo "============================================================"
+echo "=== MADENGINE BUILD ON COMPUTE NODE ==="
+echo "============================================================"
+echo ""
+echo "Job ID: $SLURM_JOB_ID"
+echo "Build Node: $(hostname)"
+echo "Working directory: $(pwd)"
+echo "Registry: {registry}"
+echo ""
+
+# Change to submission directory
+cd {Path.cwd().absolute()}
+
+# Step 0: Docker login for registry push
+echo "=== Step 0: Docker Registry Authentication ==="
+DOCKER_USER="${{MAD_DOCKERHUB_USER:-}}"
+DOCKER_PASS="${{MAD_DOCKERHUB_PASSWORD:-}}"
+
+# Try credential.json if env vars not set
+if [ -z "$DOCKER_USER" ] && [ -f "credential.json" ]; then
+    echo "Reading credentials from credential.json..."
+    DOCKER_USER=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('username', ''))" 2>/dev/null || echo "")
+    DOCKER_PASS=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('password', ''))" 2>/dev/null || echo "")
+fi
+
+if [ -n "$DOCKER_USER" ] && [ -n "$DOCKER_PASS" ]; then
+    echo "Logging in to registry as $DOCKER_USER..."
+    echo "$DOCKER_PASS" | docker login {registry_host} -u "$DOCKER_USER" --password-stdin
+    LOGIN_RC=$?
+    if [ $LOGIN_RC -ne 0 ]; then
+        echo ""
+        echo "❌ Docker login FAILED with exit code $LOGIN_RC"
+        echo ""
+        echo "Troubleshooting:"
+        echo "  - Verify MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD are correct"
+        echo "  - For Docker Hub, use a Personal Access Token (PAT) not your password"
+        echo "  - Check if the registry URL is correct: {registry_host}"
+        exit $LOGIN_RC
+    fi
+    echo "✅ Docker login SUCCESS"
+else
+    echo "No credentials found - assuming public registry or pre-authenticated"
+fi
+echo ""
+
+# Step 1: Build Docker image
+echo ""
+echo "=== Step 1: Building Docker image ==="
+echo "Dockerfile: {dockerfile_path}"
+echo "Local image name: {local_image_name}"
+echo ""
+
+docker build --network=host -t {local_image_name} {"--no-cache" if clean_cache else ""} --pull -f {dockerfile_path} ./docker
+BUILD_RC=$?
+
+if [ $BUILD_RC -ne 0 ]; then
+    echo ""
+    echo "❌ Docker build FAILED on $(hostname) with exit code $BUILD_RC"
+    exit $BUILD_RC
+fi
+
+echo ""
+echo "✅ Docker build SUCCESS on $(hostname)"
+echo ""
+
+# Step 2: Tag and push to registry
+echo "=== Step 2: Pushing to registry ==="
+echo "Tagging: {local_image_name} -> {registry_image_name}"
+docker tag {local_image_name} {registry_image_name}
+
+echo "Pushing: {registry_image_name}"
+docker push {registry_image_name}
+PUSH_RC=$?
+
+if [ $PUSH_RC -ne 0 ]; then
+    echo ""
+    echo "❌ Docker push FAILED with exit code $PUSH_RC"
+    echo ""
+    echo "Troubleshooting:"
+    echo "  - Check if you have push access to {registry}"
+    echo "  - Verify credentials are correct (MAD_DOCKERHUB_USER, MAD_DOCKERHUB_PASSWORD)"
+    echo "  - For Docker Hub, ensure the repository exists or you have create permissions"
+    exit $PUSH_RC
+fi
+
+echo ""
+echo "============================================================"
+echo "✅ BUILD AND PUSH COMPLETE"
+echo "============================================================"
+echo ""
+echo "Build Node: $(hostname)"
+echo "Registry Image: {registry_image_name}"
+echo ""
+echo "Run phase will pull this image in parallel on all nodes."
+echo "============================================================"
+
+exit 0
+"""
+
+        build_script_path = Path("madengine_build_job.sh")
+        build_script_path.write_text(build_script_content)
+        build_script_path.chmod(0o755)
+
+        if inside_allocation:
+            self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]")
+            cmd = ["srun", "-N1", "--ntasks=1", "bash", str(build_script_path)]
+        else:
+            self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]")
+            cmd = ["sbatch", "--wait", str(build_script_path)]
+
+        self.rich_console.print(f"   Build script: {build_script_path}")
+        self.rich_console.print(f"   Command: {' '.join(cmd)}")
+        self.rich_console.print("")
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=False,
+                text=True,
+            )
+
+            if result.returncode != 0:
+                raise BuildError(
+                    f"Build on compute node failed with exit code {result.returncode}",
+                    context=create_error_context(
+                        operation="build_on_compute",
+                        component="BuildOrchestrator",
+                    ),
+                    suggestions=[
+                        "Check the build log files (madengine_build_*.out/err)",
+                        "Verify SLURM partition and reservation settings",
+                        "Ensure Docker is available on compute nodes",
+                        "Verify registry credentials are configured",
+                    ],
+                )
+
+            # Generate manifest with registry image name
+            self.rich_console.print(f"\n[bold cyan]📄 Generating manifest...[/bold cyan]")
+
+            manifest = {
+                "built_images": {
+                    registry_image_name: {
+                        "image_name": registry_image_name,
+                        "docker_image": registry_image_name,
+                        "local_image": local_image_name,
+                        "dockerfile": dockerfile_path,
+                        "build_time": 0,
+                        "built_on_compute": True,
+                        "registry": registry,
+                    }
+                },
+                "built_models": {
+                    registry_image_name: {
+                        "name": model_name,
+                        "image": registry_image_name,
+                        "docker_image": registry_image_name,
+                        "dockerfile": dockerfile_path,
+                        "scripts": model.get("scripts", ""),
+                        "data": model.get("data", ""),
+                        "n_gpus": model.get("n_gpus", "8"),
+                        "tags": model.get("tags", []),
+                        "slurm": slurm_config,
+                        "distributed": model.get("distributed", {}),
+                        "env_vars": {**model.get("env_vars", {}), "DOCKER_IMAGE_NAME": registry_image_name},
+                        "built_on_compute": True,
+                    }
+                },
+                "context": self.context.ctx if hasattr(self.context, 'ctx') else {},
+                "deployment_config": {
+                    "slurm": slurm_config,
+                    "distributed": model.get("distributed", {}),
+                },
+                "credentials_required": [],
+                "summary": {
+                    "successful_builds": [model_name],
+                    "failed_builds": [],
+                    "total_build_time": 0,
+                    "successful_pushes": [registry_image_name],
+                    "failed_pushes": [],
+                },
+            }
+
+            with open(manifest_output, "w") as f:
+                json.dump(manifest, f, indent=2)
+
+            self.rich_console.print(f"[green]✓ Build completed on compute node[/green]")
+            self.rich_console.print(f"[green]✓ Image pushed: {registry_image_name}[/green]")
+            self.rich_console.print(f"[green]✓ Manifest: {manifest_output}[/green]")
+            self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n")
+
+            return manifest_output
+
+        except subprocess.TimeoutExpired:
+            raise BuildError(
+                "Build on compute node timed out",
+                context=create_error_context(
+                    operation="build_on_compute",
+                    component="BuildOrchestrator",
+                ),
+            )
+        except (DiscoveryError, ConfigurationError, BuildError):
+            raise
+        except Exception as e:
+            raise BuildError(
+                f"Failed to build on compute node: {e}",
+                context=create_error_context(
+                    operation="build_on_compute",
+                    component="BuildOrchestrator",
+                ),
+            ) from e
\ No newline at end of file
diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py
index 42032fb1..7101a54b 100644
--- a/src/madengine/orchestration/run_orchestrator.py
+++ b/src/madengine/orchestration/run_orchestrator.py
@@ -217,10 +217,17 @@ def execute(
         if not self.additional_context:
             self.additional_context = {}
 
-        # Merge deployment_config into additional_context (for deployment layer to use)
+        # Merge deployment_config into additional_context (for deployment layer to use).
+        # For dict-valued keys (slurm, k8s, etc.), deep-merge so manifest
+        # values fill in gaps while runtime --additional-context wins on conflicts.
         for key in ["slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]:
-            if key in deployment_config and key not in self.additional_context:
-                self.additional_context[key] = deployment_config[key]
+            if key in deployment_config:
+                if key not in self.additional_context:
+                    self.additional_context[key] = deployment_config[key]
+                elif isinstance(deployment_config[key], dict) and isinstance(self.additional_context[key], dict):
+                    merged = dict(deployment_config[key])
+                    merged.update(self.additional_context[key])
+                    self.additional_context[key] = merged
 
         # Infer deployment target from config structure (Convention over Configuration)
         # No explicit "deploy" field needed - presence of k8s/slurm indicates deployment type
@@ -445,10 +452,17 @@ def _load_and_merge_manifest(self, manifest_file: str) -> str:
         # Merge deployment_config
         if "deployment_config" in manifest:
             stored_config = manifest["deployment_config"]
-            # Runtime --additional-context overrides stored config
+            # Runtime --additional-context overrides stored config.
+            # For dict-valued keys, deep-merge so manifest values fill
+            # in gaps (e.g. nodes, time) while runtime values win on conflicts.
             for key in ["deploy", "slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]:
                 if key in self.additional_context:
-                    stored_config[key] = self.additional_context[key]
+                    if key in stored_config and isinstance(stored_config[key], dict) and isinstance(self.additional_context[key], dict):
+                        merged = dict(stored_config[key])
+                        merged.update(self.additional_context[key])
+                        stored_config[key] = merged
+                    else:
+                        stored_config[key] = self.additional_context[key]
             manifest["deployment_config"] = stored_config
 
         # Merge context (tools, pre_scripts, post_scripts, encapsulate_script)
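The run_orchestrator hunks above implement a one-level-deep merge: manifest `deployment_config` values fill in gaps, while runtime `--additional-context` values win on conflicts. A minimal standalone sketch of that precedence rule follows; the function name `merge_config` is illustrative and not part of the madengine API:

```python
# Sketch of the one-level-deep merge used in the run_orchestrator diff above.
# For dict-valued keys, stored (manifest) values fill in gaps and runtime
# values override on conflict; non-dict values are replaced outright.
# `merge_config` is a hypothetical helper name, not a madengine API.
def merge_config(stored: dict, runtime: dict, keys: list) -> dict:
    result = dict(stored)
    for key in keys:
        if key not in runtime:
            continue  # nothing supplied at runtime; keep manifest value
        if key in result and isinstance(result[key], dict) and isinstance(runtime[key], dict):
            merged = dict(result[key])   # start from manifest values
            merged.update(runtime[key])  # runtime overrides on conflict
            result[key] = merged
        else:
            result[key] = runtime[key]   # non-dict (or new key): runtime replaces
    return result


# Example: manifest provides partition/nodes/time; runtime only overrides nodes.
stored = {"slurm": {"partition": "gpu", "nodes": 2, "time": "02:00:00"}}
runtime = {"slurm": {"nodes": 4}}
merged = merge_config(stored, runtime, ["slurm"])
# runtime "nodes" wins; manifest "partition" and "time" survive
```

Note the merge is deliberately shallow (one dict level): nested dicts inside `slurm` would still be replaced wholesale by the runtime value, which matches the behavior of the `dict(...)` + `update(...)` pattern in the diff.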