From 8c4b959c71dcf343b3c8b90bbd0288fcb118cfe5 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Feb 2026 07:33:40 +0000 Subject: [PATCH 01/12] Updates to slurm launcher --- src/madengine/cli/commands/build.py | 18 + src/madengine/deployment/slurm.py | 344 +++++++++++++++++- src/madengine/execution/container_runner.py | 206 ++++++++++- .../orchestration/build_orchestrator.py | 283 ++++++++++++++ 4 files changed, 843 insertions(+), 8 deletions(-) diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index 99166a47..e7e93a3c 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -55,6 +55,20 @@ def build( "--batch-manifest", help="Input batch.json file for batch build mode" ), ] = None, + use_image: Annotated[ + Optional[str], + typer.Option( + "--use-image", + help="Skip Docker build and use pre-built image (e.g., lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x)" + ), + ] = None, + build_on_compute: Annotated[ + bool, + typer.Option( + "--build-on-compute", + help="Build Docker images on SLURM compute node instead of login node" + ), + ] = False, additional_context: Annotated[ str, typer.Option( @@ -183,6 +197,8 @@ def build( verbose=verbose, _separate_phases=True, batch_build_metadata=batch_build_metadata if batch_build_metadata else None, + use_image=use_image, + build_on_compute=build_on_compute, ) # Initialize orchestrator in build-only mode @@ -203,6 +219,8 @@ def build( clean_cache=clean_docker_cache, manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, + use_image=use_image, + build_on_compute=build_on_compute, ) # Load build summary for display diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 577be47f..7ac6d6c4 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -249,6 +249,7 @@ def __init__(self, config: DeploymentConfig): self.gpus_per_node = self.slurm_config.get("gpus_per_node", 8) 
self.time_limit = self.slurm_config.get("time", "24:00:00") self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_output")) + self.reservation = self.slurm_config.get("reservation", None) # Setup Jinja2 template engine template_dir = Path(__file__).parent / "templates" / "slurm" @@ -261,6 +262,115 @@ def __init__(self, config: DeploymentConfig): # Generated script path self.script_path = None + # ========== OPTION 2: Detect existing SLURM allocation ========== + # If SLURM_JOB_ID exists, we're inside an salloc allocation + self.inside_allocation = os.environ.get("SLURM_JOB_ID") is not None + self.existing_job_id = os.environ.get("SLURM_JOB_ID", "") + self.allocation_nodes = self._get_allocation_node_count() + + if self.inside_allocation: + self.console.print( + f"[cyan]✓ Detected existing SLURM allocation: Job {self.existing_job_id}[/cyan]" + ) + self.console.print( + f" Allocation has {self.allocation_nodes} nodes available" + ) + + def _get_allocation_node_count(self) -> int: + """ + Get number of nodes in current SLURM allocation. + + Note: SLURM_NNODES reflects the current job step, not the full allocation. + We query the job directly using scontrol to get the actual node count. + """ + if not self.inside_allocation: + return 0 + + job_id = self.existing_job_id + + # Query the actual job's node count using scontrol (most accurate) + try: + result = subprocess.run( + ["scontrol", "show", "job", job_id], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + # Parse NumNodes=X from output + for line in result.stdout.split("\n"): + if "NumNodes=" in line: + # Format: "NumNodes=3 NumCPUs=..." 
+ for part in line.split(): + if part.startswith("NumNodes="): + try: + return int(part.split("=")[1]) + except (ValueError, IndexError): + pass + except Exception: + pass + + # Fallback: Try SLURM_JOB_NUM_NODES (full job node count, if set) + job_num_nodes = os.environ.get("SLURM_JOB_NUM_NODES") + if job_num_nodes: + try: + return int(job_num_nodes) + except ValueError: + pass + + # Fallback: SLURM_NNODES (may be step-specific, not full allocation) + nnodes = os.environ.get("SLURM_NNODES") + if nnodes: + try: + return int(nnodes) + except ValueError: + pass + + # Last resort: count nodes in SLURM_NODELIST + nodelist = os.environ.get("SLURM_NODELIST") + if nodelist: + try: + result = subprocess.run( + ["scontrol", "show", "hostname", nodelist], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return len(result.stdout.strip().split("\n")) + except Exception: + pass + + return 0 + + def _validate_allocation_nodes(self) -> tuple[bool, str]: + """ + Validate that existing allocation has enough nodes for the job. + + Returns: + Tuple of (is_valid, error_message) + """ + if not self.inside_allocation: + return True, "" + + requested_nodes = self.nodes + available_nodes = self.allocation_nodes + + if available_nodes < requested_nodes: + return False, ( + f"Insufficient nodes in current allocation. " + f"Requested: {requested_nodes}, Available: {available_nodes}. " + f"Either reduce nodes in config or use a larger allocation." 
+ ) + + if available_nodes > requested_nodes: + self.console.print( + f"[yellow]⚠ Note: Using {requested_nodes} of {available_nodes} " + f"available nodes in allocation[/yellow]" + ) + + return True, "" + def validate(self) -> bool: """Validate SLURM commands are available locally.""" # Check required SLURM CLI tools @@ -362,11 +472,6 @@ def prepare(self) -> bool: """Generate sbatch script from template.""" # Validate environment BEFORE generating job scripts self.console.print("\n[bold]Validating submission environment...[/bold]") - if not self._validate_cli_availability(): - self.console.print( - "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" - ) - return False try: self.output_dir.mkdir(parents=True, exist_ok=True) @@ -379,6 +484,23 @@ def prepare(self) -> bool: model_key = model_keys[0] model_info = self.manifest["built_models"][model_key] + # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) + launcher_type = self.distributed_config.get("launcher", "torchrun") + launcher_normalized = launcher_type.lower().replace("_", "-") + + if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: + # For disagg launchers, generate simple wrapper script + # that runs the model's .slurm script directly on baremetal + self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") + return self._prepare_baremetal_script(model_info) + + # Standard flow: validate madengine availability for complex job template + if not self._validate_cli_availability(): + self.console.print( + "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" + ) + return False + # Prepare template context context = self._prepare_template_context(model_info) @@ -400,6 +522,115 @@ def prepare(self) -> bool: self.console.print(f"[red]✗ Failed to generate script: {e}[/red]") return False + def _prepare_baremetal_script(self, model_info: Dict) -> bool: + """ + Generate a simple wrapper script for baremetal launchers 
(sglang-disagg, vllm-disagg). + + These launchers run the model's .slurm script directly on baremetal, + which then manages Docker containers via srun. No madengine wrapper needed. + """ + # Get the model's script path + model_script = model_info.get("scripts", "") + if not model_script: + self.console.print("[red]✗ No scripts defined in model_info[/red]") + return False + + # Get manifest directory (where the model script is relative to) + manifest_dir = Path(self.config.manifest_file).parent.absolute() + model_script_path = manifest_dir / model_script + + if not model_script_path.exists(): + self.console.print(f"[red]✗ Model script not found: {model_script_path}[/red]") + return False + + # Get environment variables + env_vars = {} + + # From model_info.env_vars + if "env_vars" in model_info: + env_vars.update(model_info["env_vars"]) + + # From additional_context.env_vars + if "env_vars" in self.config.additional_context: + env_vars.update(self.config.additional_context["env_vars"]) + + # From distributed config + sglang_disagg_config = self.distributed_config.get("sglang_disagg", {}) + if sglang_disagg_config: + env_vars["xP"] = str(sglang_disagg_config.get("prefill_nodes", 1)) + env_vars["yD"] = str(sglang_disagg_config.get("decode_nodes", 1)) + + # Get model args + model_args = model_info.get("args", "") + + # Generate simple wrapper script + # IMPORTANT: SBATCH directives MUST be at the top, right after #!/bin/bash + script_lines = [ + "#!/bin/bash", + f"#SBATCH --job-name=madengine-{model_info['name']}", + f"#SBATCH --output={self.output_dir}/madengine-{model_info['name']}_%j.out", + f"#SBATCH --error={self.output_dir}/madengine-{model_info['name']}_%j.err", + f"#SBATCH --partition={self.partition}", + f"#SBATCH --nodes={self.nodes}", + f"#SBATCH --ntasks={self.nodes}", + f"#SBATCH --gpus-per-node={self.gpus_per_node}", + f"#SBATCH --time={self.time_limit}", + "#SBATCH --exclusive", + ] + + # Add reservation if specified + if self.reservation: + 
script_lines.append(f"#SBATCH --reservation={self.reservation}") + + script_lines.extend([ + "", + f"# Baremetal launcher script for {model_info['name']}", + f"# Generated by madengine for sglang-disagg", + "", + "set -e", + "", + "# Environment variables", + ]) + + for key, value in env_vars.items(): + script_lines.append(f"export {key}=\"{value}\"") + + script_lines.append("") + script_lines.extend([ + "echo '=========================================='", + "echo 'Baremetal Launcher - SGLang Disaggregated'", + "echo '=========================================='", + f"echo 'Model: {model_info['name']}'", + f"echo 'Script: {model_script_path}'", + "echo 'SLURM_JOB_ID:' $SLURM_JOB_ID", + "echo 'SLURM_NNODES:' $SLURM_NNODES", + "echo 'SLURM_NODELIST:' $SLURM_NODELIST", + "echo ''", + "", + "# Change to script directory", + f"cd {model_script_path.parent}", + "", + "# Run the model script directly on baremetal", + f"echo 'Executing: bash {model_script_path.name} {model_args}'", + f"bash {model_script_path.name} {model_args}", + "", + "echo ''", + "echo 'Script completed.'", + ]) + + script_content = "\n".join(script_lines) + + # Save script + self.script_path = self.output_dir / f"madengine_{model_info['name']}.sh" + self.script_path.write_text(script_content) + self.script_path.chmod(0o755) + + self.console.print(f"[green]✓ Generated baremetal script: {self.script_path}[/green]") + self.console.print(f" Model script: {model_script_path}") + self.console.print(f" Environment: {len(env_vars)} variables") + + return True + def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: """Prepare context for Jinja2 template rendering.""" # Use hierarchical GPU resolution: runtime > deployment > model > default @@ -835,7 +1066,12 @@ def _generate_basic_env_command( # Model script should handle launcher invocation''' def deploy(self) -> DeploymentResult: - """Submit sbatch script to SLURM scheduler (locally).""" + """ + Deploy to SLURM - either via sbatch (new job) 
or bash (existing allocation). + + If SLURM_JOB_ID is set (inside salloc), runs script directly with bash. + Otherwise, submits a new job via sbatch. + """ if not self.script_path or not self.script_path.exists(): return DeploymentResult( status=DeploymentStatus.FAILED, @@ -843,6 +1079,85 @@ def deploy(self) -> DeploymentResult: message="Script not generated. Run prepare() first.", ) + # ========== BRANCH: Inside allocation vs new job ========== + if self.inside_allocation: + return self._run_inside_existing_allocation() + else: + return self._submit_new_job() + + def _run_inside_existing_allocation(self) -> DeploymentResult: + """ + Run script directly inside existing salloc allocation using bash. + + The script will use the nodes already allocated to the current job. + SLURM environment variables (SLURM_NODELIST, etc.) are inherited. + """ + # Validate node count before running + is_valid, error_msg = self._validate_allocation_nodes() + if not is_valid: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=error_msg, + ) + + self.console.print( + f"\n[bold cyan]Running inside existing SLURM allocation[/bold cyan]" + ) + self.console.print(f" Job ID: {self.existing_job_id}") + self.console.print(f" Using {self.nodes} of {self.allocation_nodes} allocated nodes") + self.console.print(f" GPUs per node: {self.gpus_per_node}") + self.console.print(f" Script: {self.script_path}") + self.console.print(f"\n[dim]Executing: bash {self.script_path}[/dim]\n") + + try: + # Run script directly with bash (synchronous, blocks until done) + # Don't capture output - let it stream directly to console + result = subprocess.run( + ["bash", str(self.script_path)], + timeout=self.config.timeout if self.config.timeout > 0 else None, + ) + + if result.returncode == 0: + self.console.print( + f"\n[green]✓ Script completed successfully in allocation {self.existing_job_id}[/green]" + ) + return DeploymentResult( + 
status=DeploymentStatus.SUCCESS, + deployment_id=self.existing_job_id, + message=f"Completed inside existing allocation {self.existing_job_id}", + logs_path=str(self.output_dir), + ) + else: + self.console.print( + f"\n[red]✗ Script failed with exit code {result.returncode}[/red]" + ) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Script failed with exit code {result.returncode}", + logs_path=str(self.output_dir), + ) + + except subprocess.TimeoutExpired: + self.console.print( + f"\n[red]✗ Script timed out after {self.config.timeout}s[/red]" + ) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Script timed out after {self.config.timeout}s", + ) + except Exception as e: + self.console.print(f"\n[red]✗ Execution error: {e}[/red]") + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Execution error: {str(e)}", + ) + + def _submit_new_job(self) -> DeploymentResult: + """Submit new SLURM job via sbatch (original behavior).""" # ==================== PREFLIGHT NODE SELECTION ==================== # For multi-node jobs with Ray/vLLM, check for clean nodes first # to avoid OOM errors from stale processes @@ -927,6 +1242,15 @@ def deploy(self) -> DeploymentResult: def monitor(self, deployment_id: str) -> DeploymentResult: """Check SLURM job status (locally).""" + # If we ran inside an existing allocation, script already completed synchronously + # No need to poll - just return success (deploy() already handled the result) + if self.inside_allocation: + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Completed (ran inside existing allocation {deployment_id})", + ) + try: # Query job status using squeue (runs locally) result = subprocess.run( @@ -1242,6 +1566,14 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: def 
cleanup(self, deployment_id: str) -> bool: """Cancel SLURM job if still running (locally).""" + # CRITICAL: Never cancel an existing allocation we're running inside! + # The user's salloc session should not be terminated by madengine + if self.inside_allocation: + self.console.print( + f"[dim]Skipping cleanup - running inside existing allocation {deployment_id}[/dim]" + ) + return True + try: subprocess.run( ["scancel", deployment_id], capture_output=True, timeout=10 diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index ba011e81..1df9dd29 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -13,6 +13,16 @@ import typing import warnings import re +import subprocess + +# Launchers that should run directly on baremetal (not inside Docker) +# These launchers manage their own Docker containers via SLURM srun commands +BAREMETAL_LAUNCHERS = [ + "sglang-disagg", + "sglang_disagg", + "vllm-disagg", + "vllm_disagg", +] from rich.console import Console as RichConsole from contextlib import redirect_stdout, redirect_stderr from madengine.core.console import Console @@ -580,6 +590,151 @@ def apply_tools( else: print(f" Note: Command '{cmd}' already added by another tool, skipping duplicate.") + def _run_on_baremetal( + self, + model_info: typing.Dict, + build_info: typing.Dict, + log_file_path: str, + timeout: int, + run_results: typing.Dict, + pre_encapsulate_post_scripts: typing.Dict, + run_env: typing.Dict, + ) -> typing.Dict: + """ + Run script directly on baremetal (not inside Docker). + + Used for launchers like sglang-disagg that manage their own Docker containers + via SLURM srun commands. The script is executed directly on the node. 
+ + Args: + model_info: Model configuration from manifest + build_info: Build information from manifest + log_file_path: Path to log file + timeout: Execution timeout in seconds + run_results: Dictionary to store run results + pre_encapsulate_post_scripts: Pre/post script configuration + run_env: Environment variables for the script + + Returns: + Dictionary with run results + """ + import shutil + + self.rich_console.print(f"[dim]{'='*80}[/dim]") + + # Prepare script path + scripts_arg = model_info["scripts"] + + # Get the current working directory (might be temp workspace) + cwd = os.getcwd() + print(f"📂 Current directory: {cwd}") + + if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"): + script_path = scripts_arg + script_name = os.path.basename(scripts_arg) + elif scripts_arg.endswith(".py"): + script_path = scripts_arg + script_name = os.path.basename(scripts_arg) + else: + # Directory specified - look for run.sh + script_path = os.path.join(scripts_arg, "run.sh") + script_name = "run.sh" + + # If script path is relative, make it absolute from cwd + if not os.path.isabs(script_path): + script_path = os.path.join(cwd, script_path) + + # Check script exists + if not os.path.exists(script_path): + print(f"⚠️ Script not found at: {script_path}") + # Try alternative locations + alt_path = os.path.join(cwd, os.path.basename(scripts_arg)) + if os.path.exists(alt_path): + script_path = alt_path + print(f"✓ Found at alternative location: {script_path}") + else: + raise FileNotFoundError(f"Script not found: {script_path}") + + script_dir = os.path.dirname(script_path) or cwd + print(f"📜 Script: {script_path}") + print(f"📁 Working directory: {script_dir}") + + # Prepare model arguments + model_args = self.context.ctx.get("model_args", model_info.get("args", "")) + print(f"📝 Arguments: {model_args}") + + # Build command + if script_path.endswith(".py"): + cmd = f"python3 {script_path} {model_args}" + else: + cmd = f"bash {script_path} {model_args}" + + 
print(f"🔧 Command: {cmd}") + + # Prepare environment + env = os.environ.copy() + env.update(run_env) + + # Add model-specific env vars from model_info + if "env_vars" in model_info and model_info["env_vars"]: + for key, value in model_info["env_vars"].items(): + env[key] = str(value) + print(f" ENV: {key}={value}") + + # Add env vars from additional_context + if self.additional_context and "env_vars" in self.additional_context: + for key, value in self.additional_context["env_vars"].items(): + env[key] = str(value) + + # Run script with logging + test_start_time = time.time() + self.rich_console.print("\n[bold blue]Running script on baremetal...[/bold blue]") + + try: + with open(log_file_path, mode="w", buffering=1) as outlog: + with redirect_stdout( + PythonicTee(outlog, self.live_output) + ), redirect_stderr(PythonicTee(outlog, self.live_output)): + print(f"⏰ Setting timeout to {timeout} seconds.") + print(f"🚀 Executing: {cmd}") + print(f"📂 Working directory: {script_dir}") + print(f"{'='*80}") + + result = subprocess.run( + cmd, + shell=True, + cwd=script_dir, + env=env, + timeout=timeout if timeout > 0 else None, + ) + + run_results["test_duration"] = time.time() - test_start_time + print(f"\n{'='*80}") + print(f"⏱️ Test Duration: {run_results['test_duration']:.2f} seconds") + + if result.returncode == 0: + run_results["status"] = "SUCCESS" + self.rich_console.print("[bold green]✓ Script completed successfully[/bold green]") + else: + run_results["status"] = "FAILURE" + run_results["status_detail"] = f"Exit code {result.returncode}" + self.rich_console.print(f"[bold red]✗ Script failed with exit code {result.returncode}[/bold red]") + raise subprocess.CalledProcessError(result.returncode, cmd) + + except subprocess.TimeoutExpired: + run_results["status"] = "FAILURE" + run_results["status_detail"] = f"Timeout after {timeout}s" + run_results["test_duration"] = time.time() - test_start_time + self.rich_console.print(f"[bold red]✗ Script timed out after 
{timeout}s[/bold red]") + raise + except Exception as e: + run_results["status"] = "FAILURE" + run_results["status_detail"] = str(e) + run_results["test_duration"] = time.time() - test_start_time + raise + + return run_results + def run_pre_post_script( self, model_docker: Docker, model_dir: str, pre_post: typing.List ) -> None: @@ -813,6 +968,15 @@ def run_container( if merged_count > 0: print(f"ℹ️ Merged {merged_count} environment variables from additional_context") + # Merge env_vars from model_info (models.json) into docker_env_vars + if "env_vars" in model_info and model_info["env_vars"]: + model_env_count = 0 + for key, value in model_info["env_vars"].items(): + self.context.ctx["docker_env_vars"][key] = str(value) + model_env_count += 1 + if model_env_count > 0: + print(f"ℹ️ Merged {model_env_count} environment variables from model_info (models.json)") + if "data" in model_info and model_info["data"] != "" and self.data: mount_datapaths = self.data.get_mountpaths(model_info["data"]) model_dataenv = self.data.get_env(model_info["data"]) @@ -874,6 +1038,44 @@ def run_container( print(f"Docker options: {docker_options}") + # ========== CHECK FOR BAREMETAL LAUNCHERS ========== + # Launchers like sglang-disagg run scripts directly on baremetal, + # not inside Docker. The script itself manages Docker containers via srun. 
+ launcher = "" + + # Debug: Print all sources + print(f"🔍 Baremetal check - looking for launcher...") + print(f" MAD_LAUNCHER_TYPE env: {os.environ.get('MAD_LAUNCHER_TYPE', '')}") + if self.additional_context: + distributed_config = self.additional_context.get("distributed", {}) + launcher = distributed_config.get("launcher", "") + print(f" additional_context.distributed.launcher: {launcher or ''}") + if not launcher and model_info.get("distributed"): + launcher = model_info["distributed"].get("launcher", "") + print(f" model_info.distributed.launcher: {launcher or ''}") + if not launcher: + launcher = os.environ.get("MAD_LAUNCHER_TYPE", "") + print(f" Fallback to MAD_LAUNCHER_TYPE: {launcher or ''}") + + print(f" Final launcher detected: {launcher or ''}") + + # Normalize launcher name (replace underscores with hyphens) + launcher_normalized = launcher.lower().replace("_", "-") if launcher else "" + + if launcher_normalized and launcher_normalized in [l.lower().replace("_", "-") for l in BAREMETAL_LAUNCHERS]: + self.rich_console.print(f"\n[bold cyan]🖥️ Running on BAREMETAL (launcher: {launcher})[/bold cyan]") + self.rich_console.print(f"[dim]Script will manage its own Docker containers via SLURM[/dim]") + return self._run_on_baremetal( + model_info=model_info, + build_info=build_info, + log_file_path=log_file_path, + timeout=timeout, + run_results=run_results, + pre_encapsulate_post_scripts=pre_encapsulate_post_scripts, + run_env=run_env, + ) + # ========== END BAREMETAL CHECK ========== + self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]") print(f"🏷️ Image: {docker_image}") print(f"📦 Container: {container_name}") @@ -992,8 +1194,8 @@ def run_container( # Prepare script execution scripts_arg = model_info["scripts"] - if scripts_arg.endswith(".sh"): - # Shell script specified directly + if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"): + # Shell script specified directly (.sh or .slurm for SLURM batch 
scripts) dir_path = os.path.dirname(scripts_arg) script_name = "bash " + os.path.basename(scripts_arg) elif scripts_arg.endswith(".py"): diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 49ee76c2..7be37c6f 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -178,6 +178,8 @@ def execute( clean_cache: bool = False, manifest_output: str = "build_manifest.json", batch_build_metadata: Optional[Dict] = None, + use_image: Optional[str] = None, + build_on_compute: bool = False, ) -> str: """ Execute build workflow. @@ -187,6 +189,8 @@ def execute( clean_cache: Whether to use --no-cache for Docker builds manifest_output: Output file for build manifest batch_build_metadata: Optional batch build metadata + use_image: Pre-built Docker image to use (skip Docker build) + build_on_compute: Build on SLURM compute node instead of login node Returns: Path to generated build_manifest.json @@ -195,6 +199,21 @@ def execute( DiscoveryError: If model discovery fails BuildError: If Docker build fails """ + # Handle pre-built image mode + if use_image: + return self._execute_with_prebuilt_image( + use_image=use_image, + manifest_output=manifest_output, + ) + + # Handle build-on-compute mode + if build_on_compute: + return self._execute_build_on_compute( + registry=registry, + clean_cache=clean_cache, + manifest_output=manifest_output, + batch_build_metadata=batch_build_metadata, + ) self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]") self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]") @@ -418,3 +437,267 @@ def _save_deployment_config(self, manifest_file: str): # Non-fatal - just warn self.rich_console.print(f"[yellow]Warning: Could not save deployment config: {e}[/yellow]") + def _execute_with_prebuilt_image( + self, + use_image: str, + manifest_output: str = 
"build_manifest.json", + ) -> str: + """ + Generate manifest for a pre-built Docker image (skip Docker build). + + This is useful when using external images like: + - lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x + - nvcr.io/nvidia/pytorch:24.01-py3 + + Args: + use_image: Pre-built Docker image name + manifest_output: Output file for build manifest + + Returns: + Path to generated build_manifest.json + """ + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🔨 BUILD PHASE (Pre-built Image Mode)[/bold blue]") + self.rich_console.print(f"[cyan]Using pre-built image: {use_image}[/cyan]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + try: + # Step 1: Discover models + self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]") + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered", + context=create_error_context( + operation="discover_models", + component="BuildOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + self.rich_console.print(f"[green]✓ Found {len(models)} models[/green]\n") + + # Step 2: Generate manifest with pre-built image + self.rich_console.print("[bold cyan]📄 Generating manifest for pre-built image...[/bold cyan]") + + manifest = { + "built_images": { + use_image: { + "image_name": use_image, + "dockerfile": "", + "build_time": 0, + "prebuilt": True, + } + }, + "built_models": {}, + "context": self.context.ctx if hasattr(self.context, 'ctx') else {}, + "credentials_required": [], + "summary": { + "successful_builds": [], + "failed_builds": [], + "total_build_time": 0, + "successful_pushes": [], + "failed_pushes": [], + }, + } + + # Add each discovered model with the pre-built image + for model in models: + model_name = model.get("name", "unknown") + manifest["built_models"][model_name] = { + "name": model_name, + "image": 
use_image, + "dockerfile": model.get("dockerfile", ""), + "scripts": model.get("scripts", ""), + "data": model.get("data", ""), + "n_gpus": model.get("n_gpus", "8"), + "owner": model.get("owner", ""), + "training_precision": model.get("training_precision", ""), + "multiple_results": model.get("multiple_results", ""), + "tags": model.get("tags", []), + "timeout": model.get("timeout", -1), + "args": model.get("args", ""), + "slurm": model.get("slurm", {}), + "distributed": model.get("distributed", {}), + "env_vars": model.get("env_vars", {}), + "prebuilt": True, + } + manifest["summary"]["successful_builds"].append(model_name) + + # Save manifest + with open(manifest_output, "w") as f: + json.dump(manifest, f, indent=2) + + # Save deployment config + self._save_deployment_config(manifest_output) + + self.rich_console.print(f"[green]✓ Generated manifest: {manifest_output}[/green]") + self.rich_console.print(f" Pre-built image: {use_image}") + self.rich_console.print(f" Models: {len(models)}") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + return manifest_output + + except (DiscoveryError, BuildError): + raise + except Exception as e: + raise BuildError( + f"Failed to generate manifest for pre-built image: {e}", + context=create_error_context( + operation="prebuilt_manifest", + component="BuildOrchestrator", + ), + ) from e + + def _execute_build_on_compute( + self, + registry: Optional[str] = None, + clean_cache: bool = False, + manifest_output: str = "build_manifest.json", + batch_build_metadata: Optional[Dict] = None, + ) -> str: + """ + Execute Docker build on a SLURM compute node instead of login node. 
+ + This submits a SLURM job that runs the Docker build on a compute node, + which is useful when: + - Login node has limited disk space + - Login node shouldn't run heavy workloads + - Compute nodes have faster storage/network + + Args: + registry: Optional registry to push images to + clean_cache: Whether to use --no-cache for Docker builds + manifest_output: Output file for build manifest + batch_build_metadata: Optional batch build metadata + + Returns: + Path to generated build_manifest.json + """ + import subprocess + import os + + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🔨 BUILD PHASE (Compute Node Mode)[/bold blue]") + self.rich_console.print("[cyan]Building on SLURM compute node...[/cyan]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + # Check if we're inside an existing allocation + inside_allocation = os.environ.get("SLURM_JOB_ID") is not None + existing_job_id = os.environ.get("SLURM_JOB_ID", "") + + # Get SLURM config from additional_context + slurm_config = self.additional_context.get("slurm", {}) + partition = slurm_config.get("partition", "gpu") + reservation = slurm_config.get("reservation", "") + time_limit = slurm_config.get("time", "02:00:00") + + # Build the madengine build command (without --build-on-compute to avoid recursion) + tags = getattr(self.args, 'tags', []) + tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" + + additional_context_str = "" + if self.additional_context: + # Serialize additional context for the compute node + import json + ctx_json = json.dumps(self.additional_context) + additional_context_str = f"--additional-context '{ctx_json}'" + + build_cmd = f"madengine build {tags_str} {additional_context_str} --manifest-output {manifest_output}" + if registry: + build_cmd += f" --registry {registry}" + if clean_cache: + build_cmd += " --clean-docker-cache" + + if inside_allocation: + # Run build on compute node via srun + 
self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") + cmd = ["srun", "-N1", "--ntasks=1", "bash", "-c", build_cmd] + else: + # Generate and submit build script + self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") + + build_script_content = f"""#!/bin/bash +#SBATCH --job-name=madengine-build +#SBATCH --partition={partition} +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --time={time_limit} +{f'#SBATCH --reservation={reservation}' if reservation else ''} +#SBATCH --output=madengine_build_%j.out +#SBATCH --error=madengine_build_%j.err + +echo "=== Building on compute node: $(hostname) ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Build command: {build_cmd}" +echo "" + +# Activate virtual environment if available +if [ -f "venv/bin/activate" ]; then + source venv/bin/activate +fi + +# Run the build +{build_cmd} + +echo "" +echo "=== Build completed ===" +""" + build_script_path = Path("madengine_build_job.sh") + build_script_path.write_text(build_script_content) + build_script_path.chmod(0o755) + + self.rich_console.print(f" Build script: {build_script_path}") + cmd = ["sbatch", "--wait", str(build_script_path)] + + # Execute the build + self.rich_console.print(f" Command: {' '.join(cmd)}") + self.rich_console.print("") + + try: + result = subprocess.run( + cmd, + capture_output=False, # Let output flow to console + text=True, + ) + + if result.returncode != 0: + raise BuildError( + f"Build on compute node failed with exit code {result.returncode}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + "Check the build log files (madengine_build_*.out/err)", + "Verify SLURM partition and reservation settings", + "Ensure Docker is available on compute nodes", + ], + ) + + self.rich_console.print(f"[green]✓ Build completed on compute node[/green]") + self.rich_console.print(f"[green]✓ Manifest: {manifest_output}[/green]") + return 
manifest_output + + except subprocess.TimeoutExpired: + raise BuildError( + "Build on compute node timed out", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + ) + except Exception as e: + raise BuildError( + f"Failed to build on compute node: {e}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + ) from e + From 05965278ea40e2975a0b474b13f92996374ad4d7 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Mon, 9 Feb 2026 22:44:52 +0000 Subject: [PATCH 02/12] Fix manifest generation and reservation handling for sglang-disagg - Add docker_image field to built_images and built_models in --use-image mode - Merge model's distributed.launcher into deployment_config for proper detection - Make reservation optional (empty string is ignored) - Pass reservation to SlurmNodeSelector for health checks Co-authored-by: Cursor --- src/madengine/deployment/slurm.py | 36 +++- .../deployment/slurm_node_selector.py | 47 +++-- .../orchestration/build_orchestrator.py | 166 +++++++++++++++--- 3 files changed, 206 insertions(+), 43 deletions(-) diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 7ac6d6c4..fccd39f8 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -485,14 +485,17 @@ def prepare(self) -> bool: model_info = self.manifest["built_models"][model_key] # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) - launcher_type = self.distributed_config.get("launcher", "torchrun") + # Priority: model_info.distributed.launcher > additional_context.distributed.launcher + model_distributed = model_info.get("distributed", {}) + launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun") launcher_normalized = launcher_type.lower().replace("_", "-") if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: # For disagg launchers, generate simple 
wrapper script # that runs the model's .slurm script directly on baremetal self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") - return self._prepare_baremetal_script(model_info) + # Pass model_key as docker_image_name (for manifests, the key IS the built image name) + return self._prepare_baremetal_script(model_info, docker_image_name=model_key) # Standard flow: validate madengine availability for complex job template if not self._validate_cli_availability(): @@ -522,12 +525,16 @@ def prepare(self) -> bool: self.console.print(f"[red]✗ Failed to generate script: {e}[/red]") return False - def _prepare_baremetal_script(self, model_info: Dict) -> bool: + def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool: """ Generate a simple wrapper script for baremetal launchers (sglang-disagg, vllm-disagg). These launchers run the model's .slurm script directly on baremetal, which then manages Docker containers via srun. No madengine wrapper needed. 
+ + Args: + model_info: Model configuration from manifest + docker_image_name: The built Docker image name from manifest key """ # Get the model's script path model_script = model_info.get("scripts", "") @@ -554,12 +561,30 @@ def _prepare_baremetal_script(self, model_info: Dict) -> bool: if "env_vars" in self.config.additional_context: env_vars.update(self.config.additional_context["env_vars"]) - # From distributed config - sglang_disagg_config = self.distributed_config.get("sglang_disagg", {}) + # From distributed config (model's distributed section) + model_distributed = model_info.get("distributed", {}) + sglang_disagg_config = model_distributed.get("sglang_disagg", {}) or self.distributed_config.get("sglang_disagg", {}) if sglang_disagg_config: env_vars["xP"] = str(sglang_disagg_config.get("prefill_nodes", 1)) env_vars["yD"] = str(sglang_disagg_config.get("decode_nodes", 1)) + # Override DOCKER_IMAGE_NAME with the built image from manifest + # This ensures the run uses the freshly built image, not the base image + # Priority: docker_image_name param > model_info.docker_image > env_vars.DOCKER_IMAGE_NAME + if docker_image_name and docker_image_name.startswith("ci-"): + # The manifest key IS the built image name for madengine-built images + self.console.print(f"[cyan]Using built Docker image: {docker_image_name}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = docker_image_name + elif "docker_image" in model_info: + built_image = model_info["docker_image"] + self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = built_image + elif "image" in model_info: + # Fallback to 'image' field + built_image = model_info["image"] + self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = built_image + # Get model args model_args = model_info.get("args", "") @@ -1170,6 +1195,7 @@ def _submit_new_job(self) -> DeploymentResult: console=self.console, auto_cleanup=auto_cleanup, 
verbose=self.slurm_config.get("verbose_node_check", False), + reservation=self.reservation, ) # Select clean nodes and get updated exclude list diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py index b52f53d7..a494ff76 100644 --- a/src/madengine/deployment/slurm_node_selector.py +++ b/src/madengine/deployment/slurm_node_selector.py @@ -71,6 +71,7 @@ def __init__( auto_cleanup: bool = False, verbose: bool = False, timeout: int = 30, + reservation: Optional[str] = None, ): """ Initialize node selector. @@ -80,11 +81,13 @@ def __init__( auto_cleanup: Automatically clean dirty nodes verbose: Enable verbose logging timeout: Timeout for srun commands (seconds) + reservation: SLURM reservation name for reserved nodes """ self.console = console or Console() self.auto_cleanup = auto_cleanup self.verbose = verbose self.timeout = timeout + self.reservation = reservation def get_candidate_nodes( self, @@ -199,16 +202,20 @@ def check_node_health(self, node: str) -> NodeStatus: try: # Use srun to execute check on specific node + srun_cmd = [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", # Allow overlap with running jobs + "--quiet", + ] + if self.reservation: + srun_cmd.append(f"--reservation={self.reservation}") + srun_cmd.extend(["bash", "-c", check_script]) + result = subprocess.run( - [ - "srun", - f"--nodelist={node}", - "--ntasks=1", - "--time=00:01:00", - "--overlap", # Allow overlap with running jobs - "--quiet", - "bash", "-c", check_script - ], + srun_cmd, capture_output=True, text=True, timeout=self.timeout, @@ -309,16 +316,20 @@ def cleanup_node(self, node: str) -> bool: """ try: + srun_cmd = [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", + "--quiet", + ] + if self.reservation: + srun_cmd.append(f"--reservation={self.reservation}") + srun_cmd.extend(["bash", "-c", cleanup_script]) + result = subprocess.run( - [ - "srun", - 
f"--nodelist={node}", - "--ntasks=1", - "--time=00:01:00", - "--overlap", - "--quiet", - "bash", "-c", cleanup_script - ], + srun_cmd, capture_output=True, text=True, timeout=self.timeout, diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 7be37c6f..808e865e 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -489,6 +489,7 @@ def _execute_with_prebuilt_image( "built_images": { use_image: { "image_name": use_image, + "docker_image": use_image, "dockerfile": "", "build_time": 0, "prebuilt": True, @@ -507,11 +508,16 @@ def _execute_with_prebuilt_image( } # Add each discovered model with the pre-built image + # Use the image name as the key (matches how madengine build does it) for model in models: model_name = model.get("name", "unknown") - manifest["built_models"][model_name] = { + model_distributed = model.get("distributed", {}) + + # Use image name as key so slurm.py can find docker_image + manifest["built_models"][use_image] = { "name": model_name, "image": use_image, + "docker_image": use_image, "dockerfile": model.get("dockerfile", ""), "scripts": model.get("scripts", ""), "data": model.get("data", ""), @@ -523,7 +529,7 @@ def _execute_with_prebuilt_image( "timeout": model.get("timeout", -1), "args": model.get("args", ""), "slurm": model.get("slurm", {}), - "distributed": model.get("distributed", {}), + "distributed": model_distributed, "env_vars": model.get("env_vars", {}), "prebuilt": True, } @@ -535,6 +541,28 @@ def _execute_with_prebuilt_image( # Save deployment config self._save_deployment_config(manifest_output) + + # Merge model's distributed config (especially launcher) into deployment_config + # This ensures sglang-disagg launcher is in deployment_config even if not in additional-context + if models and models[0].get("distributed"): + with open(manifest_output, "r") as f: + saved_manifest = json.load(f) + + 
model_distributed = models[0].get("distributed", {}) + if "deployment_config" not in saved_manifest: + saved_manifest["deployment_config"] = {} + + # Merge model's distributed into deployment_config.distributed + if "distributed" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["distributed"] = {} + + # Copy launcher and other critical fields from model config + for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg"]: + if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: + saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] + + with open(manifest_output, "w") as f: + json.dump(saved_manifest, f, indent=2) self.rich_console.print(f"[green]✓ Generated manifest: {manifest_output}[/green]") self.rich_console.print(f" Pre-built image: {use_image}") @@ -596,23 +624,36 @@ def _execute_build_on_compute( partition = slurm_config.get("partition", "gpu") reservation = slurm_config.get("reservation", "") time_limit = slurm_config.get("time", "02:00:00") + # Get number of nodes - build on ALL nodes so image is available everywhere + nodes = slurm_config.get("nodes", 1) # Build the madengine build command (without --build-on-compute to avoid recursion) tags = getattr(self.args, 'tags', []) tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" + # Write additional context to a file to avoid shell quoting issues + context_file_path = None additional_context_str = "" if self.additional_context: - # Serialize additional context for the compute node import json - ctx_json = json.dumps(self.additional_context) - additional_context_str = f"--additional-context '{ctx_json}'" - - build_cmd = f"madengine build {tags_str} {additional_context_str} --manifest-output {manifest_output}" + context_file_path = Path("madengine_build_context.json") + with open(context_file_path, 'w') as f: + json.dump(self.additional_context, f) + 
self.rich_console.print(f" Context file: {context_file_path}") + + # Base build command + build_cmd_parts = ["madengine", "build"] + if tags_str: + build_cmd_parts.extend(tags_str.split()) + if context_file_path: + build_cmd_parts.extend(["--additional-context-file", str(context_file_path)]) + build_cmd_parts.extend(["--manifest-output", manifest_output]) if registry: - build_cmd += f" --registry {registry}" + build_cmd_parts.extend(["--registry", registry]) if clean_cache: - build_cmd += " --clean-docker-cache" + build_cmd_parts.append("--clean-docker-cache") + + build_cmd = " ".join(build_cmd_parts) if inside_allocation: # Run build on compute node via srun @@ -622,31 +663,117 @@ def _execute_build_on_compute( # Generate and submit build script self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") + # Get absolute path for context file + abs_context_file = str(context_file_path.absolute()) if context_file_path else "" + abs_manifest_output = str(Path(manifest_output).absolute()) + + # Rebuild command with absolute paths for sbatch + build_cmd_abs = f"madengine build {tags_str}" + if abs_context_file: + build_cmd_abs += f" --additional-context-file {abs_context_file}" + build_cmd_abs += f" --manifest-output {abs_manifest_output}" + if registry: + build_cmd_abs += f" --registry {registry}" + if clean_cache: + build_cmd_abs += " --clean-docker-cache" + + # Discover models to get Dockerfile path + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + dockerfile_path = "" + dockerfile_name = "" + if models: + dockerfile = models[0].get("dockerfile", "") + # Find the actual Dockerfile + import glob + dockerfile_patterns = [ + f"{dockerfile}.ubuntu.amd.Dockerfile", + f"{dockerfile}.Dockerfile", + f"{dockerfile}", + ] + for pattern in dockerfile_patterns: + matches = glob.glob(pattern) + if matches: + dockerfile_path = matches[0] + dockerfile_name = Path(dockerfile_path).name + break + + self.rich_console.print(f" Nodes: 
{nodes} (building on all nodes)") + if dockerfile_path: + self.rich_console.print(f" Dockerfile: {dockerfile_path}") + build_script_content = f"""#!/bin/bash #SBATCH --job-name=madengine-build #SBATCH --partition={partition} -#SBATCH --nodes=1 -#SBATCH --ntasks=1 +#SBATCH --nodes={nodes} +#SBATCH --ntasks={nodes} #SBATCH --time={time_limit} {f'#SBATCH --reservation={reservation}' if reservation else ''} #SBATCH --output=madengine_build_%j.out #SBATCH --error=madengine_build_%j.err -echo "=== Building on compute node: $(hostname) ===" +echo "=== Building on compute nodes ===" echo "Job ID: $SLURM_JOB_ID" -echo "Build command: {build_cmd}" +echo "Nodes: $SLURM_NNODES" +echo "Node list: $SLURM_NODELIST" +echo "Working directory: $(pwd)" echo "" +# Change to submission directory +cd {Path.cwd().absolute()} + # Activate virtual environment if available -if [ -f "venv/bin/activate" ]; then - source venv/bin/activate +if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then + source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} + echo "Activated virtual environment" +fi# Step 1: Build Docker image on ALL nodes in parallel +echo "" +echo "=== Building Docker image on all $SLURM_NNODES nodes ===" +DOCKERFILE="{dockerfile_path}" +if [ -n "$DOCKERFILE" ] && [ -f "$DOCKERFILE" ]; then + # Get the image name - must match exactly what madengine generates + # Format: ci-_ + IMAGE_NAME=$(basename $DOCKERFILE .Dockerfile) + FULL_IMAGE_NAME="ci-{models[0].get('name', 'model') if models else 'model'}_$IMAGE_NAME" + + echo "Dockerfile: $DOCKERFILE" + echo "Image name: $FULL_IMAGE_NAME" + + # Build on all nodes in parallel using srun + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c " + echo \\\"[\\$(hostname)] Building Docker image...\\\" + cd {Path.cwd().absolute()} + docker build --network=host -t $FULL_IMAGE_NAME --pull -f $DOCKERFILE ./docker + BUILD_RC=\\$? 
+ if [ \\$BUILD_RC -eq 0 ]; then + echo \\\"[\\$(hostname)] Docker build SUCCESS\\\" + else + echo \\\"[\\$(hostname)] Docker build FAILED with exit code \\$BUILD_RC\\\" + fi + exit \\$BUILD_RC + " + DOCKER_BUILD_EXIT=$? + + if [ $DOCKER_BUILD_EXIT -ne 0 ]; then + echo "Docker build failed on one or more nodes" + exit $DOCKER_BUILD_EXIT + fi + echo "" + echo "=== Docker image built on all nodes ===" fi -# Run the build -{build_cmd} +# Step 2: Run madengine build on rank 0 to generate manifest +echo "" +echo "=== Generating build manifest ===" +echo "Build command: {build_cmd_abs}" +echo "" + +{build_cmd_abs} +BUILD_EXIT=$? echo "" -echo "=== Build completed ===" +echo "=== Build completed with exit code: $BUILD_EXIT ===" +exit $BUILD_EXIT """ build_script_path = Path("madengine_build_job.sh") build_script_path.write_text(build_script_content) @@ -699,5 +826,4 @@ def _execute_build_on_compute( operation="build_on_compute", component="BuildOrchestrator", ), - ) from e - + ) from e \ No newline at end of file From e7db0be9ade8b4dfc161041543222516fea2d354 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Feb 2026 04:57:41 +0000 Subject: [PATCH 03/12] Fix syntax error in build-on-compute sbatch script generation Add missing newline after 'fi' statement to prevent syntax error in generated sbatch script when using --build-on-compute option. 
Co-authored-by: Cursor --- src/madengine/orchestration/build_orchestrator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 808e865e..76830240 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -726,7 +726,9 @@ def _execute_build_on_compute( if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} echo "Activated virtual environment" -fi# Step 1: Build Docker image on ALL nodes in parallel +fi + +# Step 1: Build Docker image on ALL nodes in parallel echo "" echo "=== Building Docker image on all $SLURM_NNODES nodes ===" DOCKERFILE="{dockerfile_path}" From 2aec5ed5d620eb866580c478c0c1afa40da0e032 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 4 Mar 2026 07:04:32 +0000 Subject: [PATCH 04/12] Rename sglang-disagg/vllm-disagg launchers to slurm_multi Replace legacy launcher names with unified slurm_multi: - Remove sglang-disagg, vllm-disagg from VALID_LAUNCHERS - Add slurm_multi as the only baremetal multi-node launcher - Update deployment logic in slurm.py and kubernetes.py - Rename example config files from sglang-disagg-* to slurm-multi-* - Update docs/launchers.md references Breaking change: sglang-disagg and vllm-disagg are no longer valid launcher names. Use slurm_multi instead. 
Made-with: Cursor --- docs/launchers.md | 18 ++++---- ...lit.json => slurm-multi-custom-split.json} | 2 +- ...json => slurm-multi-multi-node-basic.json} | 2 +- ...-minimal.json => slurm-multi-minimal.json} | 2 +- ...lit.json => slurm-multi-custom-split.json} | 2 +- ...-node.json => slurm-multi-multi-node.json} | 2 +- ...-minimal.json => slurm-multi-minimal.json} | 2 +- src/madengine/deployment/kubernetes.py | 28 ++++++++----- src/madengine/deployment/slurm.py | 41 +++++++++++-------- src/madengine/execution/container_runner.py | 13 +++--- 10 files changed, 63 insertions(+), 49 deletions(-) rename examples/k8s-configs/basic/{sglang-disagg-custom-split.json => slurm-multi-custom-split.json} (97%) rename examples/k8s-configs/basic/{sglang-disagg-multi-node-basic.json => slurm-multi-multi-node-basic.json} (97%) rename examples/k8s-configs/minimal/{sglang-disagg-minimal.json => slurm-multi-minimal.json} (92%) rename examples/slurm-configs/basic/{sglang-disagg-custom-split.json => slurm-multi-custom-split.json} (98%) rename examples/slurm-configs/basic/{sglang-disagg-multi-node.json => slurm-multi-multi-node.json} (98%) rename examples/slurm-configs/minimal/{sglang-disagg-minimal.json => slurm-multi-minimal.json} (93%) diff --git a/docs/launchers.md b/docs/launchers.md index b4ae7d34..58e725ae 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -364,7 +364,7 @@ SGLang Disaggregated separates inference into specialized node pools: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "sglang_disagg": { @@ -403,7 +403,7 @@ Override automatic split based on workload characteristics: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "sglang_disagg": { @@ -457,12 +457,12 @@ SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." 
``` **Examples**: -- K8s Minimal: `examples/k8s-configs/minimal/sglang-disagg-minimal.json` -- K8s Basic: `examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json` -- K8s Custom: `examples/k8s-configs/basic/sglang-disagg-custom-split.json` -- SLURM Minimal: `examples/slurm-configs/minimal/sglang-disagg-minimal.json` -- SLURM Basic: `examples/slurm-configs/basic/sglang-disagg-multi-node.json` -- SLURM Custom: `examples/slurm-configs/basic/sglang-disagg-custom-split.json` +- K8s Minimal: `examples/k8s-configs/minimal/slurm_multi-minimal.json` +- K8s Basic: `examples/k8s-configs/basic/slurm_multi-multi-node-basic.json` +- K8s Custom: `examples/k8s-configs/basic/slurm_multi-custom-split.json` +- SLURM Minimal: `examples/slurm-configs/minimal/slurm_multi-minimal.json` +- SLURM Basic: `examples/slurm-configs/basic/slurm_multi-multi-node.json` +- SLURM Custom: `examples/slurm-configs/basic/slurm_multi-custom-split.json` **Comparison: SGLang vs SGLang Disaggregated**: @@ -681,7 +681,7 @@ SGLANG_NODE_RANK=${SLURM_PROCID} ```bash Error: Unknown launcher type 'xyz' ``` -Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `sglang-disagg` +Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `slurm_multi` **2. 
Multi-Node Communication Fails** ```bash diff --git a/examples/k8s-configs/basic/sglang-disagg-custom-split.json b/examples/k8s-configs/basic/slurm-multi-custom-split.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-custom-split.json rename to examples/k8s-configs/basic/slurm-multi-custom-split.json index 49aeecb1..12fd2a4a 100644 --- a/examples/k8s-configs/basic/sglang-disagg-custom-split.json +++ b/examples/k8s-configs/basic/slurm-multi-custom-split.json @@ -26,7 +26,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "master_port": 29500, diff --git a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json rename to examples/k8s-configs/basic/slurm-multi-multi-node-basic.json index c16fd342..94f51fc5 100644 --- a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json +++ b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "master_port": 29500 diff --git a/examples/k8s-configs/minimal/sglang-disagg-minimal.json b/examples/k8s-configs/minimal/slurm-multi-minimal.json similarity index 92% rename from examples/k8s-configs/minimal/sglang-disagg-minimal.json rename to examples/k8s-configs/minimal/slurm-multi-minimal.json index f0f6ad05..1aab4386 100644 --- a/examples/k8s-configs/minimal/sglang-disagg-minimal.json +++ b/examples/k8s-configs/minimal/slurm-multi-minimal.json @@ -11,7 +11,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/slurm-multi-custom-split.json similarity index 98% rename 
from examples/slurm-configs/basic/sglang-disagg-custom-split.json rename to examples/slurm-configs/basic/slurm-multi-custom-split.json index 291a5938..0c7962aa 100644 --- a/examples/slurm-configs/basic/sglang-disagg-custom-split.json +++ b/examples/slurm-configs/basic/slurm-multi-custom-split.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/slurm-multi-multi-node.json similarity index 98% rename from examples/slurm-configs/basic/sglang-disagg-multi-node.json rename to examples/slurm-configs/basic/slurm-multi-multi-node.json index 0c5ec00d..f6ea63e8 100644 --- a/examples/slurm-configs/basic/sglang-disagg-multi-node.json +++ b/examples/slurm-configs/basic/slurm-multi-multi-node.json @@ -24,7 +24,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/minimal/sglang-disagg-minimal.json b/examples/slurm-configs/minimal/slurm-multi-minimal.json similarity index 93% rename from examples/slurm-configs/minimal/sglang-disagg-minimal.json rename to examples/slurm-configs/minimal/slurm-multi-minimal.json index ee4ad9f2..95fb9761 100644 --- a/examples/slurm-configs/minimal/sglang-disagg-minimal.json +++ b/examples/slurm-configs/minimal/slurm-multi-minimal.json @@ -14,7 +14,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 9d5b6e9f..6cd15809 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -51,7 +51,13 @@ "megatron-lm", "vllm", "sglang", - "sglang-disagg" + "slurm_multi", +] + +# slurm_multi launcher name variants (underscore and hyphen) 
+SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] @@ -959,20 +965,20 @@ def _prepare_template_context( model_script=model_info.get("scripts", "run.sh") ) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]: if nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " + f"slurm_multi launcher requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) # Always create headless service for disaggregated architecture create_headless_service = True - self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") + self.console.print(f"[dim]slurm_multi: Creating headless service for {nnodes} pods[/dim]") self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") - # Generate SGLang Disaggregated launcher command - launcher_command = self._generate_sglang_disagg_command( + # Generate slurm_multi launcher command + launcher_command = self._generate_slurm_multi_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, @@ -1631,13 +1637,13 @@ def _generate_torchtitan_command( --tee=3 \\ {model_script}""" - def _generate_sglang_disagg_command( + def _generate_slurm_multi_command( self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str ) -> str: """ - Generate SGLang Disaggregated launcher command for K8s Indexed Jobs. + Generate slurm_multi launcher command for K8s Indexed Jobs. 
- SGLang Disaggregated uses separate node pools for: + slurm_multi uses separate node pools for: - Proxy (index 0): Load balancer and request router - Prefill (indices 1 to xP): Prompt processing - Decode (indices xP+1 to end): Token generation @@ -1656,7 +1662,7 @@ def _generate_sglang_disagg_command( model_script: Path to model launch script Returns: - Complete disaggregated launch setup + Complete multi-node launch setup Raises: ValueError: If nnodes < 3 or invalid parameters @@ -1664,7 +1670,7 @@ def _generate_sglang_disagg_command( # Validate if not isinstance(nnodes, int) or nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes, got {nnodes}" + f"slurm_multi requires minimum 3 nodes, got {nnodes}" ) if not isinstance(nproc_per_node, int) or nproc_per_node < 1: raise ValueError(f"nproc_per_node must be >= 1, got {nproc_per_node}") diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 73bd90df..8187b36f 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -33,7 +33,13 @@ "megatron-lm", "vllm", "sglang", - "sglang-disagg" + "slurm_multi", +] + +# slurm_multi launcher name variants (underscore and hyphen) +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] @@ -484,16 +490,18 @@ def prepare(self) -> bool: model_key = model_keys[0] model_info = self.manifest["built_models"][model_key] - # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) + # Check if this is a slurm_multi launcher (baremetal multi-node) # Priority: model_info.distributed.launcher > additional_context.distributed.launcher model_distributed = model_info.get("distributed", {}) launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun") launcher_normalized = launcher_type.lower().replace("_", "-") - if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: - # For disagg launchers, generate simple wrapper script + # Check against 
slurm_multi aliases (includes legacy sglang-disagg, vllm-disagg) + slurm_multi_aliases_normalized = [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES] + if launcher_normalized in slurm_multi_aliases_normalized: + # For slurm_multi launchers, generate simple wrapper script # that runs the model's .slurm script directly on baremetal - self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") + self.console.print(f"[cyan]Detected slurm_multi launcher: {launcher_type}[/cyan]") # Pass model_key as docker_image_name (for manifests, the key IS the built image name) return self._prepare_baremetal_script(model_info, docker_image_name=model_key) @@ -527,10 +535,11 @@ def prepare(self) -> bool: def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool: """ - Generate a simple wrapper script for baremetal launchers (sglang-disagg, vllm-disagg). + Generate a simple wrapper script for baremetal/slurm_multi launchers. - These launchers run the model's .slurm script directly on baremetal, - which then manages Docker containers via srun. No madengine wrapper needed. + These launchers (slurm_multi, sglang-disagg, vllm-disagg) run the model's + .slurm script directly on baremetal, which then manages Docker containers + via srun. No madengine wrapper needed. 
Args: model_info: Model configuration from manifest @@ -610,7 +619,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.extend([ "", f"# Baremetal launcher script for {model_info['name']}", - f"# Generated by madengine for sglang-disagg", + f"# Generated by madengine for slurm_multi", "", "set -e", "", @@ -779,8 +788,8 @@ def _generate_launcher_command( return self._generate_vllm_command(nnodes, nproc_per_node, master_port) elif launcher_type == "sglang": return self._generate_sglang_command(nnodes, nproc_per_node, master_port) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": - return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port) + elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]: + return self._generate_slurm_multi_command(nnodes, nproc_per_node, master_port) elif launcher_type == "deepspeed": return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port) elif launcher_type == "megatron": @@ -877,13 +886,13 @@ def _generate_sglang_command( export SGLANG_PIPELINE_PARALLEL_SIZE={nnodes} # SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' - def _generate_sglang_disagg_command( + def _generate_slurm_multi_command( self, nnodes: int, nproc_per_node: int, master_port: int ) -> str: """ - Generate SGLang Disaggregated launcher environment for SLURM. + Generate slurm_multi launcher environment for SLURM. 
- SGLang Disaggregated Architecture: + slurm_multi Architecture (multi-node baremetal): - Node 0: Proxy (load balancer) - Nodes 1 to xP: Prefill nodes - Nodes xP+1 to xP+yD: Decode nodes @@ -899,11 +908,11 @@ def _generate_sglang_disagg_command( Environment setup with node role assignment Raises: - ValueError: If nnodes < 3 (minimum for disagg) + ValueError: If nnodes < 3 """ if nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " + f"slurm_multi requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 1df9dd29..46bc8b42 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -17,12 +17,11 @@ # Launchers that should run directly on baremetal (not inside Docker) # These launchers manage their own Docker containers via SLURM srun commands -BAREMETAL_LAUNCHERS = [ - "sglang-disagg", - "sglang_disagg", - "vllm-disagg", - "vllm_disagg", +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] +BAREMETAL_LAUNCHERS = SLURM_MULTI_ALIASES from rich.console import Console as RichConsole from contextlib import redirect_stdout, redirect_stderr from madengine.core.console import Console @@ -603,7 +602,7 @@ def _run_on_baremetal( """ Run script directly on baremetal (not inside Docker). - Used for launchers like sglang-disagg that manage their own Docker containers + Used for slurm_multi launchers that manage their own Docker containers via SLURM srun commands. The script is executed directly on the node. Args: @@ -1039,7 +1038,7 @@ def run_container( print(f"Docker options: {docker_options}") # ========== CHECK FOR BAREMETAL LAUNCHERS ========== - # Launchers like sglang-disagg run scripts directly on baremetal, + # slurm_multi launchers run scripts directly on baremetal, # not inside Docker. The script itself manages Docker containers via srun. 
launcher = "" From d7197e92072e035b9d3ad5e84fd221220863bef2 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 4 Mar 2026 21:46:14 +0000 Subject: [PATCH 05/12] Add documentation for --use-image, --build-on-compute, and SLURM features docs/cli-reference.md: - Add --use-image and --build-on-compute options to build command - Add examples for pre-built image and compute node build workflows - Document when to use each mode docs/usage.md: - Add "Pre-built Image Mode" section with examples - Add "Build on Compute Node" section explaining workflow docs/deployment.md: - Add "SLURM Allocation Detection" section - Add "Baremetal Execution (slurm_multi)" section - Document environment variables and behavior inside salloc docs/launchers.md: - Fix example file paths (slurm_multi -> slurm-multi) Made-with: Cursor --- docs/cli-reference.md | 78 +++++++++++++++++++++++++++++++++++++++++++ docs/deployment.md | 70 ++++++++++++++++++++++++++++++++++++++ docs/launchers.md | 12 +++---- docs/usage.md | 54 ++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 6 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 5d58f1e6..720f12ef 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -98,6 +98,8 @@ madengine build [OPTIONS] | `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | | `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | +| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image instead | +| `--build-on-compute` | | FLAG | `False` | Build Docker images on SLURM compute node instead of login node | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache | 
@@ -142,6 +144,16 @@ madengine build --tags model \ # Real-time output with verbose logging madengine build --tags model --live-output --verbose + +# Use pre-built image (skip Docker build) +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Build on SLURM compute node instead of login node +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json ``` **Required Context for Build:** @@ -170,6 +182,72 @@ When using `--batch-manifest`, provide a JSON file with selective build configur See [Batch Build Guide](batch-build.md) for details. +**Pre-built Image Mode (`--use-image`):** + +Skip Docker build and use an existing image from a registry or local Docker cache: + +```bash +# Use image from Docker Hub +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file config.json + +# Use image from NGC +madengine build --tags model \ + --use-image nvcr.io/nvidia/pytorch:24.01-py3 + +# Use locally cached image +madengine build --tags model \ + --use-image my-local-image:latest +``` + +**When to use `--use-image`:** +- Using official framework images (SGLang, vLLM, etc.) +- Image is pre-cached on compute nodes +- Testing without rebuilding +- CI/CD pipelines with external images + +The generated manifest marks the image as `"prebuilt": true` with `build_time: 0`. 
+ +**Build on Compute Node (`--build-on-compute`):** + +Build Docker images on SLURM compute nodes instead of the login node: + +```bash +# Build on compute node (requires SLURM config) +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json +``` + +**slurm-config.json for build-on-compute:** +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "time": "02:00:00", + "reservation": "my-reservation" + } +} +``` + +**When to use `--build-on-compute`:** +- Login node has limited disk space or resources +- Build requires GPU access (e.g., AOT compilation) +- Need image available on all compute nodes simultaneously +- Login node policies prohibit heavy workloads + +**How it works:** +1. Generates `madengine_build_job.sh` sbatch script +2. Submits via `sbatch --wait` +3. Runs `docker build` on ALL allocated nodes in parallel via `srun` +4. Generates manifest on completion + +**Inside existing SLURM allocation:** + +If you're already inside an `salloc` allocation, `--build-on-compute` uses `srun` directly instead of submitting a new job. 
+ --- ### `run` - Execute Models diff --git a/docs/deployment.md b/docs/deployment.md index 794fe3aa..3be0371c 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -258,6 +258,39 @@ SLURM automatically provides: - Network interface configuration - Rank assignment via `$SLURM_PROCID` +### SLURM Allocation Detection + +madengine automatically detects if you're running inside an existing SLURM allocation (via `salloc`): + +```bash +# Allocate nodes interactively +salloc -N 3 -p gpu --gpus-per-node=8 -t 04:00:00 + +# madengine detects the allocation automatically +madengine run --manifest-file build_manifest.json +# Output: ✓ Detected existing SLURM allocation: Job 12345 +# Allocation has 3 nodes available +``` + +**Behavior inside allocation:** +- Uses `srun` directly instead of `sbatch` +- Validates requested nodes ≤ available nodes +- Warns if using fewer nodes than allocated +- Skips job submission (already allocated) + +**Build inside allocation:** + +```bash +# Inside salloc session +madengine build --tags model --build-on-compute +# Uses srun instead of sbatch --wait +``` + +**Environment variables detected:** +- `SLURM_JOB_ID` - Indicates inside allocation +- `SLURM_NNODES` - Number of nodes available +- `SLURM_NODELIST` - List of allocated nodes + ### Monitoring ```bash @@ -372,6 +405,43 @@ scancel -u $USER } ``` +### Baremetal Execution (slurm_multi) + +For disaggregated inference workloads like SGLang Disaggregated, madengine supports baremetal execution where the model's `.slurm` script manages Docker containers directly: + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "gpus_per_node": 8, + "time": "04:00:00" + }, + "distributed": { + "launcher": "slurm_multi", + "nnodes": 3, + "nproc_per_node": 8, + "sglang_disagg": { + "prefill_nodes": 1, + "decode_nodes": 1 + } + } +} +``` + +**How baremetal execution works:** +1. madengine generates a wrapper script (not a Docker container) +2. 
The wrapper runs the model's `.slurm` script directly on baremetal +3. The `.slurm` script manages Docker containers via `srun` +4. Environment variables from `models.json` and `additional-context` are passed through + +**When to use `slurm_multi`:** +- SGLang Disaggregated inference (proxy + prefill + decode nodes) +- Workloads requiring direct SLURM node control +- Custom Docker orchestration via `.slurm` scripts + +See [Launchers Guide](launchers.md#7-sglang-disaggregated-new) for detailed configuration. + ## Troubleshooting ### Kubernetes Issues diff --git a/docs/launchers.md b/docs/launchers.md index 58e725ae..b27aca4f 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -457,12 +457,12 @@ SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." ``` **Examples**: -- K8s Minimal: `examples/k8s-configs/minimal/slurm_multi-minimal.json` -- K8s Basic: `examples/k8s-configs/basic/slurm_multi-multi-node-basic.json` -- K8s Custom: `examples/k8s-configs/basic/slurm_multi-custom-split.json` -- SLURM Minimal: `examples/slurm-configs/minimal/slurm_multi-minimal.json` -- SLURM Basic: `examples/slurm-configs/basic/slurm_multi-multi-node.json` -- SLURM Custom: `examples/slurm-configs/basic/slurm_multi-custom-split.json` +- K8s Minimal: `examples/k8s-configs/minimal/slurm-multi-minimal.json` +- K8s Basic: `examples/k8s-configs/basic/slurm-multi-multi-node-basic.json` +- K8s Custom: `examples/k8s-configs/basic/slurm-multi-custom-split.json` +- SLURM Minimal: `examples/slurm-configs/minimal/slurm-multi-minimal.json` +- SLURM Basic: `examples/slurm-configs/basic/slurm-multi-multi-node.json` +- SLURM Custom: `examples/slurm-configs/basic/slurm-multi-custom-split.json` **Comparison: SGLang vs SGLang Disaggregated**: diff --git a/docs/usage.md b/docs/usage.md index 89ebd415..edbda0fc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -273,6 +273,60 @@ madengine build --batch-manifest batch.json \ ] ``` +### Pre-built Image Mode + +Skip Docker build entirely and use an existing image: + 
+```bash +# Use external image (e.g., from Docker Hub) +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Then run normally +madengine run --manifest-file build_manifest.json +``` + +**Use cases:** +- Official framework images (SGLang, vLLM, PyTorch NGC) +- Pre-cached images on compute nodes +- Quick testing without rebuild time +- CI/CD with external registries + +The manifest marks the image as `"prebuilt": true` with zero build time. + +### Build on Compute Node + +For SLURM environments where login nodes have limited resources: + +```bash +# Build on compute nodes instead of login node +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json +``` + +**slurm-config.json:** +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "time": "02:00:00" + } +} +``` + +**How it works:** +1. Submits `madengine_build_job.sh` via `sbatch --wait` +2. Builds Docker image on ALL allocated nodes in parallel +3. 
Generates manifest after completion + +**Benefits:** +- Offloads heavy build to compute resources +- Image available on all nodes simultaneously +- Respects login node resource policies + ## Run Workflow ### Local Execution From 22e2b35411f057042b38d8bf00f24788ac1933c5 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 04:32:44 +0000 Subject: [PATCH 06/12] Enhance --use-image and --build-on-compute options --use-image changes: - Make image name optional (auto-detect from model card's DOCKER_IMAGE_NAME) - Add mutual exclusivity with --registry and --build-on-compute - Handle multiple models with different images (use first, warn) --build-on-compute changes: - Require --registry option (build once, push, pull everywhere) - Build on 1 compute node only, push to registry - Merge SLURM config from model card + additional-context (CLI overrides) - Add registry credential validation before build - Add docker login step in build script - Smart registry image naming (namespace/repo vs namespace formats) - Parallel docker pull on all nodes in run phase - Clear error messages for missing partition, credentials Run phase changes: - Detect built_on_compute flag in manifest - Pull image in parallel on all nodes via srun before model execution Documentation updated for new workflows. 
Made-with: Cursor --- docs/cli-reference.md | 73 ++- docs/usage.md | 52 +- src/madengine/cli/commands/build.py | 29 +- src/madengine/deployment/slurm.py | 43 +- .../orchestration/build_orchestrator.py | 560 +++++++++++++----- 5 files changed, 572 insertions(+), 185 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 720f12ef..0b922f6e 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -98,7 +98,7 @@ madengine build [OPTIONS] | `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | | `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | -| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image instead | +| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image. Omit value to auto-detect from model's `DOCKER_IMAGE_NAME` | | `--build-on-compute` | | FLAG | `False` | Build Docker images on SLURM compute node instead of login node | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | @@ -187,7 +187,12 @@ See [Batch Build Guide](batch-build.md) for details. Skip Docker build and use an existing image from a registry or local Docker cache: ```bash -# Use image from Docker Hub +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file config.json + +# Explicitly specify image from Docker Hub madengine build --tags sglang_disagg \ --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ --additional-context-file config.json @@ -201,6 +206,18 @@ madengine build --tags model \ --use-image my-local-image:latest ``` +**Image Resolution Priority:** +1. If `--use-image ` is specified, use that image +2. 
If `--use-image` (no value), auto-detect from model card's `DOCKER_IMAGE_NAME` env var +3. If no image found in model card, error with helpful suggestions + +**Multiple Models Warning:** +When using auto-detection with multiple models that have different `DOCKER_IMAGE_NAME` values, the first model's image is used and a warning is printed. + +**Mutual Exclusivity:** +- `--use-image` cannot be used with `--registry` (push requires local build) +- `--use-image` cannot be used with `--build-on-compute` (skip build vs. build on compute) + **When to use `--use-image`:** - Using official framework images (SGLang, vLLM, etc.) - Image is pre-cached on compute nodes @@ -211,43 +228,61 @@ The generated manifest marks the image as `"prebuilt": true` with `build_time: 0 **Build on Compute Node (`--build-on-compute`):** -Build Docker images on SLURM compute nodes instead of the login node: +Build Docker images on a SLURM compute node, push to registry, and pull in parallel during run phase: ```bash -# Build on compute node (requires SLURM config) +# Build on compute node and push to registry (--registry REQUIRED) madengine build --tags model \ --build-on-compute \ + --registry docker.io/myorg \ --additional-context-file slurm-config.json ``` -**slurm-config.json for build-on-compute:** -```json -{ - "slurm": { - "partition": "gpu", - "nodes": 3, - "time": "02:00:00", - "reservation": "my-reservation" - } -} +**Required:** `--registry` must be specified with `--build-on-compute`. + +**SLURM Config Priority:** +1. Model card's `slurm` section (base configuration) +2. 
`--additional-context` overrides (command line takes precedence) + +If the model card already has `slurm` config, you only need to provide missing or override values: + +```bash +# Model card has partition/time, just override reservation +madengine build --tags model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' ``` **When to use `--build-on-compute`:** - Login node has limited disk space or resources - Build requires GPU access (e.g., AOT compilation) -- Need image available on all compute nodes simultaneously - Login node policies prohibit heavy workloads +- Distributing images to many compute nodes (build once, pull everywhere) **How it works:** -1. Generates `madengine_build_job.sh` sbatch script -2. Submits via `sbatch --wait` -3. Runs `docker build` on ALL allocated nodes in parallel via `srun` -4. Generates manifest on completion + +*Build Phase:* +1. Discovers model and merges SLURM config (model card + additional-context) +2. Submits build job to **1 compute node** via `sbatch --wait` +3. Builds Docker image on that node +4. Pushes image to registry +5. Generates manifest with registry image name + +*Run Phase:* +1. Detects `built_on_compute: true` in manifest +2. Pulls image **in parallel on ALL nodes** via `srun docker pull` +3. Executes model script **Inside existing SLURM allocation:** If you're already inside an `salloc` allocation, `--build-on-compute` uses `srun` directly instead of submitting a new job. 
+**Error Messages:** + +If required SLURM fields are missing, specific errors are shown: +- Missing `partition`: "Add partition to model card's slurm section or via --additional-context" + --- ### `run` - Execute Models diff --git a/docs/usage.md b/docs/usage.md index edbda0fc..7008d0ed 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -278,7 +278,12 @@ madengine build --batch-manifest batch.json \ Skip Docker build entirely and use an existing image: ```bash -# Use external image (e.g., from Docker Hub) +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file slurm-config.json + +# Or explicitly specify the image madengine build --tags sglang_disagg \ --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ --additional-context-file slurm-config.json @@ -287,6 +292,15 @@ madengine build --tags sglang_disagg \ madengine run --manifest-file build_manifest.json ``` +**Image Resolution:** +1. If `--use-image ` provided → use that image +2. If `--use-image` (no value) → auto-detect from model card's `DOCKER_IMAGE_NAME` +3. If no image found → error with helpful message + +**Mutual Exclusivity:** +- Cannot use with `--registry` (push requires local build) +- Cannot use with `--build-on-compute` (skip vs. build) + **Use cases:** - Official framework images (SGLang, vLLM, PyTorch NGC) - Pre-cached images on compute nodes @@ -300,32 +314,36 @@ The manifest marks the image as `"prebuilt": true` with zero build time. 
For SLURM environments where login nodes have limited resources: ```bash -# Build on compute nodes instead of login node +# Build on 1 compute node, push to registry, pull in parallel at runtime madengine build --tags model \ --build-on-compute \ - --additional-context-file slurm-config.json + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' ``` -**slurm-config.json:** -```json -{ - "slurm": { - "partition": "gpu", - "nodes": 3, - "time": "02:00:00" - } -} -``` +**Required:** `--registry` must be specified. + +**SLURM Config Merging:** +- Model card's `slurm` section provides base configuration +- `--additional-context` overrides specific fields +- Only specify what's missing or needs override **How it works:** -1. Submits `madengine_build_job.sh` via `sbatch --wait` -2. Builds Docker image on ALL allocated nodes in parallel -3. Generates manifest after completion + +*Build Phase:* +1. Builds Docker image on **1 compute node** +2. Pushes image to registry +3. Stores registry image name in manifest + +*Run Phase:* +1. Pulls image **in parallel on ALL nodes** via `srun docker pull` +2. Executes model script **Benefits:** - Offloads heavy build to compute resources -- Image available on all nodes simultaneously +- Build once, distribute via registry pull - Respects login node resource policies +- Parallel pull scales to many nodes ## Run Workflow diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index e7e93a3c..091bf843 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -59,7 +59,9 @@ def build( Optional[str], typer.Option( "--use-image", - help="Skip Docker build and use pre-built image (e.g., lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x)" + is_flag=False, + flag_value="auto", + help="Skip Docker build and use pre-built image. 
Optionally specify image name, or omit to auto-detect from model card's DOCKER_IMAGE_NAME" ), ] = None, build_on_compute: Annotated[ @@ -124,6 +126,31 @@ def build( ) raise typer.Exit(ExitCode.INVALID_ARGS) + if use_image and registry: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --registry options[/bold red]\n" + "[yellow]Use --use-image for pre-built external images.[/yellow]\n" + "[yellow]Use --registry to push locally built images.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if use_image and build_on_compute: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --build-on-compute options[/bold red]\n" + "[yellow]--use-image skips Docker build entirely.[/yellow]\n" + "[yellow]--build-on-compute builds on SLURM compute nodes.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if build_on_compute and not registry: + console.print( + "❌ [bold red]Error: --build-on-compute requires --registry option[/bold red]\n" + "[yellow]Build on compute node pushes image to registry.[/yellow]\n" + "[yellow]Run phase will pull image in parallel on all nodes.[/yellow]\n" + "[dim]Example: --build-on-compute --registry docker.io/myorg[/dim]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + # Process batch manifest if provided batch_data = None effective_tags = processed_tags diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 8187b36f..dc891f81 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -632,7 +632,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.append("") script_lines.extend([ "echo '=========================================='", - "echo 'Baremetal Launcher - SGLang Disaggregated'", + "echo 'Baremetal Launcher - slurm_multi'", "echo '=========================================='", f"echo 'Model: {model_info['name']}'", f"echo 'Script: {model_script_path}'", @@ -640,6 +640,47 @@ 
def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "echo 'SLURM_NNODES:' $SLURM_NNODES", "echo 'SLURM_NODELIST:' $SLURM_NODELIST", "echo ''", + ]) + + # Check if image was built on compute and needs parallel pull + built_on_compute = model_info.get("built_on_compute", False) + docker_image = env_vars.get("DOCKER_IMAGE_NAME", "") + + if built_on_compute and docker_image: + # Add parallel docker pull on all nodes + script_lines.extend([ + "", + "# Pull Docker image in parallel on all nodes", + "echo '=========================================='", + "echo 'Pulling Docker image on all nodes in parallel'", + "echo '=========================================='", + f"echo 'Image: {docker_image}'", + "echo ''", + "", + f"srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c \"", + f" echo \\\"[\\$(hostname)] Pulling {docker_image}...\\\"", + f" docker pull {docker_image}", + " PULL_RC=\\$?", + " if [ \\$PULL_RC -eq 0 ]; then", + " echo \\\"[\\$(hostname)] Pull SUCCESS\\\"", + " else", + " echo \\\"[\\$(hostname)] Pull FAILED with exit code \\$PULL_RC\\\"", + " fi", + " exit \\$PULL_RC", + "\"", + "PULL_EXIT=$?", + "", + "if [ $PULL_EXIT -ne 0 ]; then", + " echo 'Docker pull failed on one or more nodes'", + " exit $PULL_EXIT", + "fi", + "", + "echo ''", + "echo 'Docker image pulled on all nodes'", + "echo ''", + ]) + + script_lines.extend([ "", "# Change to script directory", f"cd {model_script_path.parent}", diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 76830240..cfaccb43 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -201,6 +201,10 @@ def execute( """ # Handle pre-built image mode if use_image: + # If use_image is "auto", resolve from model card + if use_image == "auto": + use_image = self._resolve_image_from_model_card() + return self._execute_with_prebuilt_image( use_image=use_image, 
manifest_output=manifest_output, @@ -582,6 +586,86 @@ def _execute_with_prebuilt_image( ), ) from e + def _resolve_image_from_model_card(self) -> str: + """ + Resolve Docker image name from model card's DOCKER_IMAGE_NAME env var. + + This method discovers models and extracts the DOCKER_IMAGE_NAME from + env_vars. If multiple models have different images, uses the first + and prints a warning. + + Returns: + Docker image name from model card + + Raises: + ConfigurationError: If no DOCKER_IMAGE_NAME found in any model + """ + self.rich_console.print("[bold cyan]🔍 Auto-detecting image from model card...[/bold cyan]") + + # Discover models to get their env_vars + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise ConfigurationError( + "No models discovered for image auto-detection", + context=create_error_context( + operation="resolve_image", + component="BuildOrchestrator", + ), + suggestions=[ + "Specify image name explicitly with --use-image ", + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + # Collect DOCKER_IMAGE_NAME from all models + images_found = {} + for model in models: + model_name = model.get("name", "unknown") + env_vars = model.get("env_vars", {}) + docker_image = env_vars.get("DOCKER_IMAGE_NAME") + + if docker_image: + images_found[model_name] = docker_image + + if not images_found: + model_names = [m.get("name", "unknown") for m in models] + raise ConfigurationError( + "No DOCKER_IMAGE_NAME found in model card env_vars", + context=create_error_context( + operation="resolve_image", + component="BuildOrchestrator", + model_names=model_names, + ), + suggestions=[ + "Add DOCKER_IMAGE_NAME to model's env_vars in models.json", + "Specify image name explicitly with --use-image ", + 'Example: "env_vars": {"DOCKER_IMAGE_NAME": "myimage:tag"}', + ], + ) + + # Use first model's image + first_model = list(images_found.keys())[0] + resolved_image = 
images_found[first_model] + + # Warn if multiple models have different images + unique_images = set(images_found.values()) + if len(unique_images) > 1: + self.rich_console.print( + f"[yellow]⚠️ Warning: Multiple models have different DOCKER_IMAGE_NAME values:[/yellow]" + ) + for model_name, image in images_found.items(): + self.rich_console.print(f" - {model_name}: {image}") + self.rich_console.print( + f"[yellow] Using image from '{first_model}': {resolved_image}[/yellow]\n" + ) + else: + self.rich_console.print(f"[green]✓ Auto-detected image: {resolved_image}[/green]\n") + + return resolved_image + def _execute_build_on_compute( self, registry: Optional[str] = None, @@ -590,16 +674,16 @@ def _execute_build_on_compute( batch_build_metadata: Optional[Dict] = None, ) -> str: """ - Execute Docker build on a SLURM compute node instead of login node. + Execute Docker build on a SLURM compute node and push to registry. - This submits a SLURM job that runs the Docker build on a compute node, - which is useful when: - - Login node has limited disk space - - Login node shouldn't run heavy workloads - - Compute nodes have faster storage/network + Build workflow: + 1. Build on 1 compute node only + 2. Push image to registry + 3. Store registry image name in manifest + 4. 
Run phase will pull image in parallel on all nodes Args: - registry: Optional registry to push images to + registry: Registry to push images to (REQUIRED) clean_cache: Whether to use --no-cache for Docker builds manifest_output: Output file for build manifest batch_build_metadata: Optional batch build metadata @@ -609,189 +693,316 @@ def _execute_build_on_compute( """ import subprocess import os + import glob self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE (Compute Node Mode)[/bold blue]") - self.rich_console.print("[cyan]Building on SLURM compute node...[/cyan]") + self.rich_console.print("[cyan]Building on 1 compute node, pushing to registry...[/cyan]") self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + # Discover models first to get SLURM config from model card + self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]") + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered for build-on-compute", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + model = models[0] + model_name = model.get("name", "unknown") + self.rich_console.print(f"[green]✓ Found model: {model_name}[/green]\n") + + # Merge SLURM config: model card (base) + additional-context (override) + model_slurm_config = model.get("slurm", {}) + context_slurm_config = self.additional_context.get("slurm", {}) + + # Start with model card config, then override with command-line context + slurm_config = {**model_slurm_config, **context_slurm_config} + + self.rich_console.print("[bold cyan]📋 SLURM Configuration (merged):[/bold cyan]") + if model_slurm_config: + self.rich_console.print(f" [dim]From model card:[/dim] {list(model_slurm_config.keys())}") + if context_slurm_config: + 
self.rich_console.print(f" [dim]From --additional-context (overrides):[/dim] {list(context_slurm_config.keys())}") + + # Validate required fields + partition = slurm_config.get("partition") + if not partition: + raise ConfigurationError( + "Missing required SLURM field: partition", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + 'Add "partition" to model card\'s slurm section', + 'Or specify via --additional-context \'{"slurm": {"partition": "gpu"}}\'', + ], + ) + + reservation = slurm_config.get("reservation", "") + time_limit = slurm_config.get("time", "02:00:00") + + self.rich_console.print(f" Partition: {partition}") + self.rich_console.print(f" Time limit: {time_limit}") + if reservation: + self.rich_console.print(f" Reservation: {reservation}") + self.rich_console.print("") + + # Validate registry credentials + self.rich_console.print("[bold cyan]🔐 Registry Configuration:[/bold cyan]") + self.rich_console.print(f" Registry: {registry}") + + # Check for credentials - either from environment or credential.json + dockerhub_user = os.environ.get("MAD_DOCKERHUB_USER", "") + dockerhub_password = os.environ.get("MAD_DOCKERHUB_PASSWORD", "") + + # Try to load from credential.json if env vars not set + credential_file = Path("credential.json") + if not dockerhub_user and credential_file.exists(): + try: + with open(credential_file) as f: + creds = json.load(f) + dockerhub_creds = creds.get("dockerhub", {}) + dockerhub_user = dockerhub_creds.get("username", "") + dockerhub_password = dockerhub_creds.get("password", "") + if dockerhub_user: + self.rich_console.print(f" Credentials: Found in credential.json") + except (json.JSONDecodeError, IOError) as e: + self.rich_console.print(f" [yellow]Warning: Could not read credential.json: {e}[/yellow]") + elif dockerhub_user: + self.rich_console.print(f" Credentials: Found in environment (MAD_DOCKERHUB_USER)") + + # Determine if registry requires 
authentication + requires_auth = True + public_registries = ["docker.io", "ghcr.io", "gcr.io", "quay.io", "nvcr.io"] + registry_lower = registry.lower() if registry else "" + + # For docker.io pushes, authentication is always required + if any(pub_reg in registry_lower for pub_reg in public_registries): + if not dockerhub_user or not dockerhub_password: + raise ConfigurationError( + f"Registry credentials required for pushing to {registry}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + registry=registry, + ), + suggestions=[ + "Set environment variables: MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD", + 'Or create credential.json: {"dockerhub": {"username": "...", "password": "..."}}', + "For Docker Hub, use a Personal Access Token (PAT) as password", + f"Example: export MAD_DOCKERHUB_USER=myuser", + f"Example: export MAD_DOCKERHUB_PASSWORD=dckr_pat_xxxxx", + ], + ) + self.rich_console.print(f" Auth: Will login to registry before push") + else: + # Private/internal registry - may not need auth + self.rich_console.print(f" Auth: Private registry (auth may not be required)") + requires_auth = dockerhub_user and dockerhub_password + + self.rich_console.print("") + # Check if we're inside an existing allocation inside_allocation = os.environ.get("SLURM_JOB_ID") is not None existing_job_id = os.environ.get("SLURM_JOB_ID", "") - - # Get SLURM config from additional_context - slurm_config = self.additional_context.get("slurm", {}) - partition = slurm_config.get("partition", "gpu") - reservation = slurm_config.get("reservation", "") - time_limit = slurm_config.get("time", "02:00:00") - # Get number of nodes - build on ALL nodes so image is available everywhere - nodes = slurm_config.get("nodes", 1) - - # Build the madengine build command (without --build-on-compute to avoid recursion) - tags = getattr(self.args, 'tags', []) - tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" - # Write additional context 
to a file to avoid shell quoting issues - context_file_path = None - additional_context_str = "" - if self.additional_context: - import json - context_file_path = Path("madengine_build_context.json") - with open(context_file_path, 'w') as f: - json.dump(self.additional_context, f) - self.rich_console.print(f" Context file: {context_file_path}") - - # Base build command - build_cmd_parts = ["madengine", "build"] - if tags_str: - build_cmd_parts.extend(tags_str.split()) - if context_file_path: - build_cmd_parts.extend(["--additional-context-file", str(context_file_path)]) - build_cmd_parts.extend(["--manifest-output", manifest_output]) - if registry: - build_cmd_parts.extend(["--registry", registry]) - if clean_cache: - build_cmd_parts.append("--clean-docker-cache") - - build_cmd = " ".join(build_cmd_parts) - - if inside_allocation: - # Run build on compute node via srun - self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") - cmd = ["srun", "-N1", "--ntasks=1", "bash", "-c", build_cmd] + # Find Dockerfile + dockerfile = model.get("dockerfile", "") + dockerfile_path = "" + dockerfile_patterns = [ + f"{dockerfile}.ubuntu.amd.Dockerfile", + f"{dockerfile}.Dockerfile", + f"{dockerfile}", + ] + for pattern in dockerfile_patterns: + matches = glob.glob(pattern) + if matches: + dockerfile_path = matches[0] + break + + if not dockerfile_path: + raise ConfigurationError( + f"Dockerfile not found for model {model_name}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + dockerfile=dockerfile, + ), + suggestions=[ + f"Check if {dockerfile}.ubuntu.amd.Dockerfile exists", + "Verify the dockerfile path in models.json", + ], + ) + + # Generate image name for registry + dockerfile_basename = Path(dockerfile_path).name.replace(".Dockerfile", "").replace(".ubuntu.amd", "") + local_image_name = f"ci-{model_name}_{dockerfile_basename}" + + # Determine registry image name based on 
registry format + # docker.io/namespace/repo -> use model name as tag: docker.io/namespace/repo:model_name + # docker.io/namespace -> use model name as repo: docker.io/namespace/model_name:latest + registry_parts = registry.replace("docker.io/", "").split("/") + if len(registry_parts) >= 2: + # Registry already includes repo name (e.g., rocm/pytorch-private) + # Use model name as tag + registry_image_name = f"{registry}:{model_name}" + self.rich_console.print(f" [dim]Registry format: namespace/repo -> using model name as tag[/dim]") else: - # Generate and submit build script - self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") - - # Get absolute path for context file - abs_context_file = str(context_file_path.absolute()) if context_file_path else "" - abs_manifest_output = str(Path(manifest_output).absolute()) - - # Rebuild command with absolute paths for sbatch - build_cmd_abs = f"madengine build {tags_str}" - if abs_context_file: - build_cmd_abs += f" --additional-context-file {abs_context_file}" - build_cmd_abs += f" --manifest-output {abs_manifest_output}" - if registry: - build_cmd_abs += f" --registry {registry}" - if clean_cache: - build_cmd_abs += " --clean-docker-cache" - - # Discover models to get Dockerfile path - discover_models = DiscoverModels(args=self.args) - models = discover_models.run() - dockerfile_path = "" - dockerfile_name = "" - if models: - dockerfile = models[0].get("dockerfile", "") - # Find the actual Dockerfile - import glob - dockerfile_patterns = [ - f"{dockerfile}.ubuntu.amd.Dockerfile", - f"{dockerfile}.Dockerfile", - f"{dockerfile}", - ] - for pattern in dockerfile_patterns: - matches = glob.glob(pattern) - if matches: - dockerfile_path = matches[0] - dockerfile_name = Path(dockerfile_path).name - break - - self.rich_console.print(f" Nodes: {nodes} (building on all nodes)") - if dockerfile_path: - self.rich_console.print(f" Dockerfile: {dockerfile_path}") - - build_script_content = f"""#!/bin/bash + # 
Registry is just namespace (e.g., myuser) + # Use model name as repo + registry_image_name = f"{registry}/{model_name}:latest" + self.rich_console.print(f" [dim]Registry format: namespace -> using model name as repo[/dim]") + + self.rich_console.print("[bold cyan]🐳 Docker Configuration:[/bold cyan]") + self.rich_console.print(f" Dockerfile: {dockerfile_path}") + self.rich_console.print(f" Local image: {local_image_name}") + self.rich_console.print(f" Registry image: {registry_image_name}") + self.rich_console.print("") + + # Determine registry host for docker login + registry_host = registry.split("/")[0] if "/" in registry else registry + + # Build script content - builds on 1 node, pushes to registry + build_script_content = f"""#!/bin/bash #SBATCH --job-name=madengine-build #SBATCH --partition={partition} -#SBATCH --nodes={nodes} -#SBATCH --ntasks={nodes} +#SBATCH --nodes=1 +#SBATCH --ntasks=1 #SBATCH --time={time_limit} {f'#SBATCH --reservation={reservation}' if reservation else ''} #SBATCH --output=madengine_build_%j.out #SBATCH --error=madengine_build_%j.err -echo "=== Building on compute nodes ===" +echo "============================================================" +echo "=== MADENGINE BUILD ON COMPUTE NODE ===" +echo "============================================================" +echo "" echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_NNODES" -echo "Node list: $SLURM_NODELIST" +echo "Build Node: $(hostname)" echo "Working directory: $(pwd)" +echo "Registry: {registry}" echo "" # Change to submission directory cd {Path.cwd().absolute()} -# Activate virtual environment if available -if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then - source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} - echo "Activated virtual environment" +# Step 0: Docker login for registry push +echo "=== Step 0: Docker Registry Authentication ===" +DOCKER_USER="${{MAD_DOCKERHUB_USER:-}}" 
+DOCKER_PASS="${{MAD_DOCKERHUB_PASSWORD:-}}" + +# Try credential.json if env vars not set +if [ -z "$DOCKER_USER" ] && [ -f "credential.json" ]; then + echo "Reading credentials from credential.json..." + DOCKER_USER=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('username', ''))" 2>/dev/null || echo "") + DOCKER_PASS=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('password', ''))" 2>/dev/null || echo "") fi -# Step 1: Build Docker image on ALL nodes in parallel -echo "" -echo "=== Building Docker image on all $SLURM_NNODES nodes ===" -DOCKERFILE="{dockerfile_path}" -if [ -n "$DOCKERFILE" ] && [ -f "$DOCKERFILE" ]; then - # Get the image name - must match exactly what madengine generates - # Format: ci-_ - IMAGE_NAME=$(basename $DOCKERFILE .Dockerfile) - FULL_IMAGE_NAME="ci-{models[0].get('name', 'model') if models else 'model'}_$IMAGE_NAME" - - echo "Dockerfile: $DOCKERFILE" - echo "Image name: $FULL_IMAGE_NAME" - - # Build on all nodes in parallel using srun - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c " - echo \\\"[\\$(hostname)] Building Docker image...\\\" - cd {Path.cwd().absolute()} - docker build --network=host -t $FULL_IMAGE_NAME --pull -f $DOCKERFILE ./docker - BUILD_RC=\\$? - if [ \\$BUILD_RC -eq 0 ]; then - echo \\\"[\\$(hostname)] Docker build SUCCESS\\\" - else - echo \\\"[\\$(hostname)] Docker build FAILED with exit code \\$BUILD_RC\\\" - fi - exit \\$BUILD_RC - " - DOCKER_BUILD_EXIT=$? - - if [ $DOCKER_BUILD_EXIT -ne 0 ]; then - echo "Docker build failed on one or more nodes" - exit $DOCKER_BUILD_EXIT +if [ -n "$DOCKER_USER" ] && [ -n "$DOCKER_PASS" ]; then + echo "Logging in to registry as $DOCKER_USER..." + echo "$DOCKER_PASS" | docker login {registry_host} -u "$DOCKER_USER" --password-stdin + LOGIN_RC=$? 
+ if [ $LOGIN_RC -ne 0 ]; then + echo "" + echo "❌ Docker login FAILED with exit code $LOGIN_RC" + echo "" + echo "Troubleshooting:" + echo " - Verify MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD are correct" + echo " - For Docker Hub, use a Personal Access Token (PAT) not your password" + echo " - Check if the registry URL is correct: {registry_host}" + exit $LOGIN_RC fi + echo "✅ Docker login SUCCESS" +else + echo "No credentials found - assuming public registry or pre-authenticated" +fi +echo "" + +# Step 1: Build Docker image +echo "" +echo "=== Step 1: Building Docker image ===" +echo "Dockerfile: {dockerfile_path}" +echo "Local image name: {local_image_name}" +echo "" + +docker build --network=host -t {local_image_name} {"--no-cache" if clean_cache else ""} --pull -f {dockerfile_path} ./docker +BUILD_RC=$? + +if [ $BUILD_RC -ne 0 ]; then echo "" - echo "=== Docker image built on all nodes ===" + echo "❌ Docker build FAILED on $(hostname) with exit code $BUILD_RC" + exit $BUILD_RC fi -# Step 2: Run madengine build on rank 0 to generate manifest echo "" -echo "=== Generating build manifest ===" -echo "Build command: {build_cmd_abs}" +echo "✅ Docker build SUCCESS on $(hostname)" echo "" -{build_cmd_abs} -BUILD_EXIT=$? +# Step 2: Tag and push to registry +echo "=== Step 2: Pushing to registry ===" +echo "Tagging: {local_image_name} -> {registry_image_name}" +docker tag {local_image_name} {registry_image_name} + +echo "Pushing: {registry_image_name}" +docker push {registry_image_name} +PUSH_RC=$? 
+ +if [ $PUSH_RC -ne 0 ]; then + echo "" + echo "❌ Docker push FAILED with exit code $PUSH_RC" + echo "" + echo "Troubleshooting:" + echo " - Check if you have push access to {registry}" + echo " - Verify credentials are correct (MAD_DOCKERHUB_USER, MAD_DOCKERHUB_PASSWORD)" + echo " - For Docker Hub, ensure the repository exists or you have create permissions" + exit $PUSH_RC +fi echo "" -echo "=== Build completed with exit code: $BUILD_EXIT ===" -exit $BUILD_EXIT +echo "============================================================" +echo "✅ BUILD AND PUSH COMPLETE" +echo "============================================================" +echo "" +echo "Build Node: $(hostname)" +echo "Registry Image: {registry_image_name}" +echo "" +echo "Run phase will pull this image in parallel on all nodes." +echo "============================================================" + +exit 0 """ - build_script_path = Path("madengine_build_job.sh") - build_script_path.write_text(build_script_content) - build_script_path.chmod(0o755) - - self.rich_console.print(f" Build script: {build_script_path}") + + build_script_path = Path("madengine_build_job.sh") + build_script_path.write_text(build_script_content) + build_script_path.chmod(0o755) + + if inside_allocation: + self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") + cmd = ["srun", "-N1", "--ntasks=1", "bash", str(build_script_path)] + else: + self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") cmd = ["sbatch", "--wait", str(build_script_path)] - - # Execute the build + + self.rich_console.print(f" Build script: {build_script_path}") self.rich_console.print(f" Command: {' '.join(cmd)}") self.rich_console.print("") try: result = subprocess.run( cmd, - capture_output=False, # Let output flow to console + capture_output=False, text=True, ) @@ -806,11 +1017,64 @@ def _execute_build_on_compute( "Check the build log files (madengine_build_*.out/err)", "Verify SLURM 
partition and reservation settings", "Ensure Docker is available on compute nodes", + "Verify registry credentials are configured", ], ) + # Generate manifest with registry image name + self.rich_console.print(f"\n[bold cyan]📄 Generating manifest...[/bold cyan]") + + manifest = { + "built_images": { + registry_image_name: { + "image_name": registry_image_name, + "docker_image": registry_image_name, + "local_image": local_image_name, + "dockerfile": dockerfile_path, + "build_time": 0, + "built_on_compute": True, + "registry": registry, + } + }, + "built_models": { + registry_image_name: { + "name": model_name, + "image": registry_image_name, + "docker_image": registry_image_name, + "dockerfile": dockerfile_path, + "scripts": model.get("scripts", ""), + "data": model.get("data", ""), + "n_gpus": model.get("n_gpus", "8"), + "tags": model.get("tags", []), + "slurm": slurm_config, + "distributed": model.get("distributed", {}), + "env_vars": model.get("env_vars", {}), + "built_on_compute": True, + } + }, + "context": self.context.ctx if hasattr(self.context, 'ctx') else {}, + "deployment_config": { + "slurm": slurm_config, + "distributed": model.get("distributed", {}), + }, + "credentials_required": [], + "summary": { + "successful_builds": [model_name], + "failed_builds": [], + "total_build_time": 0, + "successful_pushes": [registry_image_name], + "failed_pushes": [], + }, + } + + with open(manifest_output, "w") as f: + json.dump(manifest, f, indent=2) + self.rich_console.print(f"[green]✓ Build completed on compute node[/green]") + self.rich_console.print(f"[green]✓ Image pushed: {registry_image_name}[/green]") self.rich_console.print(f"[green]✓ Manifest: {manifest_output}[/green]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + return manifest_output except subprocess.TimeoutExpired: @@ -821,6 +1085,8 @@ def _execute_build_on_compute( component="BuildOrchestrator", ), ) + except (DiscoveryError, ConfigurationError, BuildError): + raise except Exception as e: 
raise BuildError( f"Failed to build on compute node: {e}", From 18f056ee27db0291f599cbb8643e9c74b1183baa Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 06:57:09 +0000 Subject: [PATCH 07/12] Enforce registry requirement for slurm_multi launcher - slurm_multi launcher now requires --registry or --use-image - Parallel docker pull on all nodes for any registry image (not just built_on_compute) - DOCKER_IMAGE_NAME added to manifest env_vars for all registry builds - Documentation updated for slurm_multi requirements Made-with: Cursor --- docs/cli-reference.md | 52 +++++++++++++++++++ src/madengine/deployment/slurm.py | 8 +-- src/madengine/execution/docker_builder.py | 11 ++++ .../orchestration/build_orchestrator.py | 35 ++++++++++++- 4 files changed, 101 insertions(+), 5 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 0b922f6e..fdb949e4 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -285,6 +285,58 @@ If required SLURM fields are missing, specific errors are shown: --- +**Multi-Node SLURM Launcher (`slurm_multi`):** + +Models using the `slurm_multi` launcher (for multi-node distributed inference) **require** either `--registry` or `--use-image`: + +```bash +# Option 1: Build and push to registry +madengine build --tags sglang_model \ + --registry docker.io/myorg \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Option 2: Use pre-built image from registry +madengine build --tags sglang_model \ + --use-image docker.io/myorg/sglang:latest + +# Option 3: Build on compute and push +madengine build --tags sglang_model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context-file config.json +``` + +**Why this requirement?** + +Multi-node SLURM jobs run on multiple compute nodes. 
Each node needs access to the Docker image:
+- Local builds only exist on the login/build node
+- Compute nodes cannot access locally built images
+- Registry images enable parallel `docker pull` on all nodes
+
+**Parallel Image Pull:**
+
+During `madengine run`, images from a registry are automatically pulled in parallel on all allocated nodes:
+
+```bash
+srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES docker pull <image>
+```
+
+This ensures fast, consistent image availability across the cluster.
+
+**Re-using Images:**
+
+For subsequent runs with the same image, use `--use-image` to skip building:
+
+```bash
+# First run: build and push
+madengine build --tags model --registry docker.io/myorg
+
+# Subsequent runs: use pre-built image
+madengine build --tags model --use-image docker.io/myorg/model:latest
+```
+
+---
+
 ### `run` - Execute Models
 
 Run models locally or deploy to Kubernetes/SLURM clusters.
 
diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py
index dc891f81..e38fffaa 100644
--- a/src/madengine/deployment/slurm.py
+++ b/src/madengine/deployment/slurm.py
@@ -642,12 +642,14 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N
             "echo ''",
         ])
 
-        # Check if image was built on compute and needs parallel pull
-        built_on_compute = model_info.get("built_on_compute", False)
+        # Check if image needs parallel pull on all nodes
+        # Pull if: image is from registry (contains / or .) and not a local ci-* build
         docker_image = env_vars.get("DOCKER_IMAGE_NAME", "")
+        is_registry_image = docker_image and not docker_image.startswith("ci-") and ("/" in docker_image or "."
in docker_image) - if built_on_compute and docker_image: + if is_registry_image: # Add parallel docker pull on all nodes + # This ensures all nodes have the image before running script_lines.extend([ "", "# Pull Docker image in parallel on all nodes", diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py index 3901d864..fe2d5ae5 100644 --- a/src/madengine/execution/docker_builder.py +++ b/src/madengine/execution/docker_builder.py @@ -416,6 +416,17 @@ def export_build_manifest( "registry" ) + # Update built_models with registry image name for parallel pull in slurm_multi + # Map local image to registry image for env_vars + for image_name, build_info in self.built_images.items(): + registry_image = build_info.get("registry_image") + if registry_image and image_name in self.built_models: + model_data = self.built_models[image_name] + if "env_vars" not in model_data: + model_data["env_vars"] = {} + # Set DOCKER_IMAGE_NAME to registry image for parallel pull + model_data["env_vars"]["DOCKER_IMAGE_NAME"] = registry_image + manifest = { "built_images": self.built_images, "built_models": self.built_models, diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index cfaccb43..f664e87e 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -218,6 +218,33 @@ def execute( manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, ) + + # For normal build: check if slurm_multi launcher requires registry + # Discover models first to check launcher + discover_models = DiscoverModels(args=self.args) + discovered_models = discover_models.run() + + if discovered_models: + for model in discovered_models: + launcher = model.get("distributed", {}).get("launcher", "") + if launcher in ["slurm_multi", "slurm-multi"] and not registry: + model_name = model.get("name", "unknown") + raise 
ConfigurationError( + f"slurm_multi launcher requires --registry or --use-image", + context=create_error_context( + operation="build", + component="BuildOrchestrator", + model=model_name, + launcher=launcher, + ), + suggestions=[ + "Use --registry docker.io/myorg to push image (nodes will pull in parallel)", + "Use --use-image to use a pre-built image from registry", + "Use --build-on-compute --registry to build on compute and push", + "For subsequent runs with same image, use: --use-image", + ], + ) + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]") self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]") @@ -517,6 +544,10 @@ def _execute_with_prebuilt_image( model_name = model.get("name", "unknown") model_distributed = model.get("distributed", {}) + # Merge DOCKER_IMAGE_NAME into env_vars for parallel pull in run phase + model_env_vars = model.get("env_vars", {}).copy() + model_env_vars["DOCKER_IMAGE_NAME"] = use_image + # Use image name as key so slurm.py can find docker_image manifest["built_models"][use_image] = { "name": model_name, @@ -534,7 +565,7 @@ def _execute_with_prebuilt_image( "args": model.get("args", ""), "slurm": model.get("slurm", {}), "distributed": model_distributed, - "env_vars": model.get("env_vars", {}), + "env_vars": model_env_vars, "prebuilt": True, } manifest["summary"]["successful_builds"].append(model_name) @@ -1048,7 +1079,7 @@ def _execute_build_on_compute( "tags": model.get("tags", []), "slurm": slurm_config, "distributed": model.get("distributed", {}), - "env_vars": model.get("env_vars", {}), + "env_vars": {**model.get("env_vars", {}), "DOCKER_IMAGE_NAME": registry_image_name}, "built_on_compute": True, } }, From 3da3403df6f6a8a5e3daa045e246588c657b0c30 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 06:59:36 +0000 Subject: [PATCH 08/12] Add slurm_multi registry requirement to all docs Ensure consistent documentation across: 
- deployment.md
- launchers.md
- usage.md

All now mention that slurm_multi requires --registry or --use-image.

Made-with: Cursor
---
 docs/deployment.md | 17 +++++++++++++++++
 docs/launchers.md  | 12 ++++++++++++
 docs/usage.md      | 21 +++++++++++++++++++++
 3 files changed, 50 insertions(+)

diff --git a/docs/deployment.md b/docs/deployment.md
index 3be0371c..d5545f5f 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -440,6 +440,23 @@ For disaggregated inference workloads like SGLang Disaggregated, madengine suppo
 - Workloads requiring direct SLURM node control
 - Custom Docker orchestration via `.slurm` scripts
 
+**Registry Requirement:**
+
+Models using `slurm_multi` **require** either `--registry` or `--use-image` during build:
+
+```bash
+# Option 1: Build and push to registry
+madengine build --tags model --registry docker.io/myorg
+
+# Option 2: Use pre-built image
+madengine build --tags model --use-image <image>
+
+# Option 3: Build on compute and push
+madengine build --tags model --build-on-compute --registry docker.io/myorg
+```
+
+This ensures all compute nodes can pull the image in parallel during `madengine run`.
+
 See [Launchers Guide](launchers.md#7-sglang-disaggregated-new) for detailed configuration.
 
 ## Troubleshooting
diff --git a/docs/launchers.md b/docs/launchers.md
index b27aca4f..99b130c7 100644
--- a/docs/launchers.md
+++ b/docs/launchers.md
@@ -436,6 +436,18 @@ Override automatic split based on workload characteristics:
 - Ray cluster coordination
 - No torchrun needed (manages own processes)
 
+**Registry Requirement (SLURM)**:
+
+Models using `slurm_multi` launcher **require** `--registry` or `--use-image` during build:
+
+```bash
+madengine build --tags model --registry docker.io/myorg
+# OR
+madengine build --tags model --use-image <image>
+```
+
+This ensures all compute nodes can pull the image in parallel during `madengine run`.
+
 **Environment Variables (K8s)**:
 ```bash
 POD_INDEX=${JOB_COMPLETION_INDEX}  # Pod index for role assignment
 ```
diff --git a/docs/usage.md b/docs/usage.md
index 7008d0ed..bdee5cb3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -345,6 +345,27 @@ madengine build --tags model \
 - Respects login node resource policies
 - Parallel pull scales to many nodes
 
+### Multi-Node SLURM (slurm_multi)
+
+Models using the `slurm_multi` launcher **require** either `--registry` or `--use-image`:
+
+```bash
+# Option 1: Build and push
+madengine build --tags sglang_model --registry docker.io/myorg
+
+# Option 2: Use pre-built image
+madengine build --tags sglang_model --use-image <image>
+
+# Option 3: Build on compute
+madengine build --tags sglang_model --build-on-compute --registry docker.io/myorg
+```
+
+**Why?** Multi-node jobs run on multiple compute nodes. Each node needs the Docker image, and local builds only exist on the login node.
+
+**Parallel Pull:** During `madengine run`, registry images are automatically pulled in parallel on all nodes before execution.
+
+**Re-using images:** For subsequent runs with the same image, use `--use-image` to skip building.
+ ## Run Workflow ### Local Execution From 00292c4fb7d4bd184bffd790a7041f86d97b28a0 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Mon, 9 Mar 2026 04:38:39 +0000 Subject: [PATCH 09/12] Fix build validation, manifest merging, and SLURM monitoring robustness - Make gpu_vendor/guest_os optional when using --use-image (pre-built image) - Remove model name truncation in CLI results table - Merge model card slurm and distributed config into build_manifest.json - Add skip_monitoring flag for synchronous salloc execution - Add completion marker file for robust sbatch job completion detection Made-with: Cursor --- src/madengine/cli/commands/build.py | 4 +- src/madengine/cli/utils.py | 4 +- src/madengine/cli/validators.py | 44 +++++++++++++++-- src/madengine/deployment/base.py | 4 +- src/madengine/deployment/slurm.py | 47 +++++++++++++++++++ .../orchestration/build_orchestrator.py | 35 +++++++++----- 6 files changed, 119 insertions(+), 19 deletions(-) diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index 091bf843..594d0661 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -208,8 +208,8 @@ def build( ) try: - # Validate additional context - validate_additional_context(additional_context, additional_context_file) + # Validate additional context (gpu_vendor/guest_os optional when using pre-built image) + validate_additional_context(additional_context, additional_context_file, use_image) # Create arguments object args = create_args_namespace( diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index 500232d7..a02f0421 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -165,7 +165,7 @@ def extract_model_name(item): else: model_name = docker_image return model_name - return str(item)[:20] + return str(item) # Helper function to format numbers def format_number(value): @@ -247,7 +247,7 @@ def format_number(value): row_index += 1 else: # Fallback for 
non-dict items - model_name = str(item)[:20] + model_name = str(item) if has_node_data: row = [str(row_index), "✅ Success", model_name, "node-0", "-", "-"] else: diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index 6bfc7bdb..6a5d5013 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -26,6 +26,7 @@ def validate_additional_context( additional_context: str, additional_context_file: Optional[str] = None, + use_image: Optional[str] = None, ) -> Dict[str, str]: """ Validate and parse additional context. @@ -33,6 +34,7 @@ def validate_additional_context( Args: additional_context: JSON string containing additional context additional_context_file: Optional file containing additional context + use_image: Optional pre-built image to use (skips required field validation) Returns: Dict containing parsed additional context @@ -65,6 +67,36 @@ def validate_additional_context( console.print("💡 Please provide valid JSON format") raise typer.Exit(ExitCode.INVALID_ARGS) + # When using pre-built image, gpu_vendor and guest_os are optional + # They will be extracted from model card env_vars if needed + using_prebuilt_image = use_image and use_image.lower() not in ["none", ""] + + if using_prebuilt_image: + # For pre-built images, context is optional - will use model card env_vars + if not context: + console.print("ℹ️ No additional context provided (using pre-built image)") + console.print("💡 gpu_vendor and guest_os will be read from model card env_vars if needed") + return {} + + # If context provided, validate any gpu_vendor/guest_os values present + if "gpu_vendor" in context: + gpu_vendor = context["gpu_vendor"].upper() + if gpu_vendor not in VALID_GPU_VENDORS: + console.print(f"❌ Invalid gpu_vendor: [red]{context['gpu_vendor']}[/red]") + console.print(f"💡 Supported values: [green]{', '.join(VALID_GPU_VENDORS)}[/green]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + if "guest_os" in context: + guest_os = 
context["guest_os"].upper() + if guest_os not in VALID_GUEST_OS: + console.print(f"❌ Invalid guest_os: [red]{context['guest_os']}[/red]") + console.print(f"💡 Supported values: [green]{', '.join(VALID_GUEST_OS)}[/green]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + console.print("✅ Context validated (pre-built image mode)") + return context + + # For Dockerfile builds, require gpu_vendor and guest_os if not context: console.print("❌ [red]No additional context provided[/red]") console.print( @@ -81,14 +113,17 @@ def validate_additional_context( [bold cyan]Required fields:[/bold cyan] • gpu_vendor: [green]AMD[/green], [green]NVIDIA[/green] -• guest_os: [green]UBUNTU[/green], [green]CENTOS[/green]""", +• guest_os: [green]UBUNTU[/green], [green]CENTOS[/green] + +[bold cyan]Or use a pre-built image:[/bold cyan] +madengine build --tags dummy --use-image auto""", title="Additional Context Help", border_style="blue", ) console.print(example_panel) raise typer.Exit(ExitCode.INVALID_ARGS) - # Validate required fields + # Validate required fields for Dockerfile builds required_fields = ["gpu_vendor", "guest_os"] missing_fields = [field for field in required_fields if field not in context] @@ -97,7 +132,10 @@ def validate_additional_context( f"❌ Missing required fields: [red]{', '.join(missing_fields)}[/red]" ) console.print( - "💡 Both gpu_vendor and guest_os are required for build operations" + "💡 Both gpu_vendor and guest_os are required for Dockerfile builds" + ) + console.print( + "💡 Or use --use-image to skip Dockerfile build" ) raise typer.Exit(ExitCode.INVALID_ARGS) diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py index 33a338a9..090b2f5a 100644 --- a/src/madengine/deployment/base.py +++ b/src/madengine/deployment/base.py @@ -51,6 +51,7 @@ class DeploymentResult: metrics: Optional[Dict[str, Any]] = None logs_path: Optional[str] = None artifacts: Optional[List[str]] = None + skip_monitoring: bool = False # Set True for synchronous runs 
(e.g., inside salloc) @property def is_success(self) -> bool: @@ -164,7 +165,8 @@ def execute(self) -> DeploymentResult: return result # Step 4: Monitor (optional) - if self.config.monitor: + # Skip monitoring if deploy() already ran synchronously (e.g., inside salloc) + if self.config.monitor and not result.skip_monitoring: result = self._monitor_until_complete(result.deployment_id) # Step 5: Collect Results (always collect, even on failure to record failed runs) diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index e38fffaa..e9f8c62f 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -682,6 +682,10 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "echo ''", ]) + # Create completion marker path for robust completion detection + # Use absolute path since script will cd to different directory + completion_marker = (self.output_dir / f"madengine_{model_info['name']}.complete").resolve() + script_lines.extend([ "", "# Change to script directory", @@ -690,11 +694,22 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "# Run the model script directly on baremetal", f"echo 'Executing: bash {model_script_path.name} {model_args}'", f"bash {model_script_path.name} {model_args}", + "SCRIPT_EXIT_CODE=$?", "", "echo ''", "echo 'Script completed.'", + "", + "# Write completion marker for madengine to detect", + f"echo \"exit_code=$SCRIPT_EXIT_CODE\" > {completion_marker}", + f"echo \"timestamp=$(date -Iseconds)\" >> {completion_marker}", + f"echo 'Completion marker written: {completion_marker}'", + "", + "exit $SCRIPT_EXIT_CODE", ]) + # Store marker path for monitor to check + self._completion_marker = completion_marker + script_content = "\n".join(script_lines) # Save script @@ -1212,6 +1227,7 @@ def _run_inside_existing_allocation(self) -> DeploymentResult: deployment_id=self.existing_job_id, message=f"Completed inside existing 
allocation {self.existing_job_id}", logs_path=str(self.output_dir), + skip_monitoring=True, # Already ran synchronously, no need to poll ) else: self.console.print( @@ -1222,6 +1238,7 @@ def _run_inside_existing_allocation(self) -> DeploymentResult: deployment_id=self.existing_job_id, message=f"Script failed with exit code {result.returncode}", logs_path=str(self.output_dir), + skip_monitoring=True, # Already ran synchronously ) except subprocess.TimeoutExpired: @@ -1337,6 +1354,36 @@ def monitor(self, deployment_id: str) -> DeploymentResult: message=f"Completed (ran inside existing allocation {deployment_id})", ) + # Check for completion marker (robust detection for interactive/salloc jobs) + if hasattr(self, '_completion_marker') and self._completion_marker: + marker_path = Path(self._completion_marker) + if marker_path.exists(): + # Read exit code from marker + try: + content = marker_path.read_text() + exit_code = 0 + for line in content.splitlines(): + if line.startswith("exit_code="): + exit_code = int(line.split("=")[1]) + break + + self.console.print(f"[green]✓ Completion marker found: {marker_path}[/green]") + + if exit_code == 0: + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Script completed successfully (exit code {exit_code})", + ) + else: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Script failed with exit code {exit_code}", + ) + except Exception as e: + self.console.print(f"[yellow]Warning: Could not read completion marker: {e}[/yellow]") + try: # Query job status using squeue (runs locally) result = subprocess.run( diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index f664e87e..5fc5e0f9 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -577,24 +577,37 @@ def _execute_with_prebuilt_image( # Save 
deployment config self._save_deployment_config(manifest_output) - # Merge model's distributed config (especially launcher) into deployment_config - # This ensures sglang-disagg launcher is in deployment_config even if not in additional-context - if models and models[0].get("distributed"): + # Merge model's distributed and slurm config into deployment_config + # This ensures launcher and slurm settings are in deployment_config even if not in additional-context + if models: with open(manifest_output, "r") as f: saved_manifest = json.load(f) - model_distributed = models[0].get("distributed", {}) if "deployment_config" not in saved_manifest: saved_manifest["deployment_config"] = {} - # Merge model's distributed into deployment_config.distributed - if "distributed" not in saved_manifest["deployment_config"]: - saved_manifest["deployment_config"]["distributed"] = {} + # Merge model's distributed config + model_distributed = models[0].get("distributed", {}) + if model_distributed: + if "distributed" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["distributed"] = {} + + # Copy launcher and other critical fields from model config + for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg", "vllm_disagg"]: + if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: + saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] - # Copy launcher and other critical fields from model config - for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg"]: - if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: - saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] + # Merge model's slurm config into deployment_config.slurm + # This enables run phase to auto-detect SLURM deployment without --additional-context + model_slurm = models[0].get("slurm", {}) + if 
model_slurm: + if "slurm" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["slurm"] = {} + + # Copy slurm settings from model config + for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir"]: + if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: + saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] with open(manifest_output, "w") as f: json.dump(saved_manifest, f, indent=2) From d7073973cfc01bc5286dbaa91a86102dc16bbc40 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Mar 2026 07:52:47 +0000 Subject: [PATCH 10/12] Add retry loop for perf.csv detection after SLURM job completion On shared NFS filesystems, perf.csv written by compute nodes may not be immediately visible to the login node. Retry up to 30s to allow NFS propagation before reporting the file as missing. Made-with: Cursor --- src/madengine/deployment/slurm.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index e9f8c62f..071f7f9c 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -1637,11 +1637,16 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: # Strategy 2: Check shared workspace (NFS) for perf.csv # When using shared storage, perf.csv is written directly to workspace + # Retry briefly to allow NFS propagation after job completion if not results["perf_files"]: workspace_perf = Path("perf.csv") - if workspace_perf.exists(): - results["perf_files"] = [str(workspace_perf)] - self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]") + for _attempt in range(6): + if workspace_perf.exists(): + results["perf_files"] = [str(workspace_perf)] + self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]") + break + import time + time.sleep(5) # Parse perf.csv to populate successful_runs and 
failed_runs # Filter based on session_start_row passed as parameter (no external files!) From 2d9a5cc549248611b38a290f191b0e6da5330fc3 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Mar 2026 23:01:00 +0000 Subject: [PATCH 11/12] Add nodelist support to slurm_multi baremetal scripts and rename Launcher column - Emit #SBATCH --nodelist in _prepare_baremetal_script() when nodelist is set via model card or --additional-context - Include nodelist in model card slurm merge key list so it propagates to the build manifest - Rename perf table column from "Launcher" to "Workload" for clarity Made-with: Cursor --- src/madengine/cli/utils.py | 4 ++-- src/madengine/deployment/slurm.py | 5 +++++ src/madengine/orchestration/build_orchestrator.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index a02f0421..d7af9c7e 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -378,7 +378,7 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Index", justify="right", style="dim") perf_table.add_column("Model", style="cyan") perf_table.add_column("Topology", justify="center", style="blue") - perf_table.add_column("Launcher", justify="center", style="magenta") # Distributed launcher + perf_table.add_column("Workload", justify="center", style="magenta") perf_table.add_column("Deployment", justify="center", style="cyan") perf_table.add_column("GPU Arch", style="yellow") perf_table.add_column("Performance", justify="right", style="green") @@ -475,7 +475,7 @@ def format_performance(perf): str(idx), model, topology, - launcher, # Distributed launcher (docker, torchrun, vllm, etc.) + launcher, # Workload type (sglang-disagg, vllm, torchrun, etc.) 
deployment_type, gpu_arch, performance, diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 071f7f9c..fe588954 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -616,6 +616,11 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N if self.reservation: script_lines.append(f"#SBATCH --reservation={self.reservation}") + # Add nodelist if specified (from model card or --additional-context) + nodelist = self._normalize_nodelist(self.slurm_config.get("nodelist")) + if nodelist: + script_lines.append(f"#SBATCH --nodelist={nodelist}") + script_lines.extend([ "", f"# Baremetal launcher script for {model_info['name']}", diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 5fc5e0f9..7f39ee37 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -605,7 +605,7 @@ def _execute_with_prebuilt_image( saved_manifest["deployment_config"]["slurm"] = {} # Copy slurm settings from model config - for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir"]: + for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir", "nodelist"]: if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] From fe22e7e861966345b8ddb3ec4fd88edd37ec03b9 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Mar 2026 23:47:17 +0000 Subject: [PATCH 12/12] Fix deep-merge of slurm config to preserve model card values (nodes, time) When --additional-context provides partial slurm overrides (e.g. only nodelist), ConfigLoader defaults were overwriting model card values like nodes and time. 
Fixed by capturing original user slurm keys before ConfigLoader runs, and deep-merging manifest slurm config with runtime context instead of replacing it wholesale. Made-with: Cursor --- .../orchestration/build_orchestrator.py | 9 +++++-- .../orchestration/run_orchestrator.py | 24 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 7f39ee37..f3a04f7a 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -84,6 +84,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None): merged_context.update(additional_context) self.additional_context = merged_context + self._original_user_slurm_keys = set(merged_context.get("slurm", {}).keys()) # Apply ConfigLoader to infer deploy type, validate, and apply defaults if self.additional_context: @@ -604,9 +605,13 @@ def _execute_with_prebuilt_image( if "slurm" not in saved_manifest["deployment_config"]: saved_manifest["deployment_config"]["slurm"] = {} - # Copy slurm settings from model config + # Copy slurm settings from model config (model card fills in + # values not explicitly set by --additional-context). + # Use _original_user_slurm_keys (captured before ConfigLoader + # applies defaults) so model card values override defaults + # but user's explicit CLI values still win. 
for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir", "nodelist"]: - if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: + if key in model_slurm and key not in self._original_user_slurm_keys: saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] with open(manifest_output, "w") as f: diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 42032fb1..7101a54b 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -217,10 +217,17 @@ def execute( if not self.additional_context: self.additional_context = {} - # Merge deployment_config into additional_context (for deployment layer to use) + # Merge deployment_config into additional_context (for deployment layer to use). + # For dict-valued keys (slurm, k8s, etc.), deep-merge so manifest + # values fill in gaps while runtime --additional-context wins on conflicts. 
for key in ["slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: - if key in deployment_config and key not in self.additional_context: - self.additional_context[key] = deployment_config[key] + if key in deployment_config: + if key not in self.additional_context: + self.additional_context[key] = deployment_config[key] + elif isinstance(deployment_config[key], dict) and isinstance(self.additional_context[key], dict): + merged = dict(deployment_config[key]) + merged.update(self.additional_context[key]) + self.additional_context[key] = merged # Infer deployment target from config structure (Convention over Configuration) # No explicit "deploy" field needed - presence of k8s/slurm indicates deployment type @@ -445,10 +452,17 @@ def _load_and_merge_manifest(self, manifest_file: str) -> str: # Merge deployment_config if "deployment_config" in manifest: stored_config = manifest["deployment_config"] - # Runtime --additional-context overrides stored config + # Runtime --additional-context overrides stored config. + # For dict-valued keys, deep-merge so manifest values fill + # in gaps (e.g. nodes, time) while runtime values win on conflicts. for key in ["deploy", "slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: if key in self.additional_context: - stored_config[key] = self.additional_context[key] + if key in stored_config and isinstance(stored_config[key], dict) and isinstance(self.additional_context[key], dict): + merged = dict(stored_config[key]) + merged.update(self.additional_context[key]) + stored_config[key] = merged + else: + stored_config[key] = self.additional_context[key] manifest["deployment_config"] = stored_config # Merge context (tools, pre_scripts, post_scripts, encapsulate_script)