diff --git a/examples/profiling-configs/rocprofv3_multi_gpu.json b/examples/profiling-configs/rocprofv3_multi_gpu.json index f8e16f89..2b2b250d 100644 --- a/examples/profiling-configs/rocprofv3_multi_gpu.json +++ b/examples/profiling-configs/rocprofv3_multi_gpu.json @@ -12,7 +12,7 @@ "nodes": 1, "gpus_per_node": 4, "time": "02:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": false }, diff --git a/examples/profiling-configs/rocprofv3_multi_node.json b/examples/profiling-configs/rocprofv3_multi_node.json index 935f969a..ad87c814 100644 --- a/examples/profiling-configs/rocprofv3_multi_node.json +++ b/examples/profiling-configs/rocprofv3_multi_node.json @@ -12,7 +12,7 @@ "nodes": 2, "gpus_per_node": 4, "time": "04:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/README.md b/examples/slurm-configs/README.md index 6bc69264..8b3afe9d 100644 --- a/examples/slurm-configs/README.md +++ b/examples/slurm-configs/README.md @@ -379,7 +379,7 @@ madengine uses intelligent multi-layer configuration merging: "nodelist": "node01,node02", // Optional: restrict job to these nodes (skips node health preflight) "gpus_per_node": 8, // GPUs per node "time": "24:00:00", // Wall time (HH:MM:SS) - "output_dir": "./slurm_output", // Local output directory + "output_dir": "./slurm_results", // Local output directory "results_dir": "/shared/results", // Shared results collection "shared_workspace": "/shared/workspace", // Shared workspace (NFS/Lustre) "exclusive": true, // Exclusive node access @@ -584,10 +584,10 @@ squeue -u $USER scontrol show job # View output logs (real-time) -tail -f slurm_output/madengine-*__*.out +tail -f slurm_results/madengine-*__*.out # View error logs -tail -f slurm_output/madengine-*__*.err +tail -f slurm_results/madengine-*__*.err # Cancel job if needed scancel @@ -600,7 +600,7 @@ scancel - Check SLURM partition exists: `sinfo` - Verify GPU 
resources available: `sinfo -o "%P %.5a %.10l %.6D %.6t %N %G"` - Check SLURM account/QoS settings -- Review job script: `slurm_output/madengine_*.sh` +- Review job script: `slurm_results/madengine_*.sh` ### Out of Memory Errors @@ -715,8 +715,8 @@ MODEL_DIR=models/my-model madengine run \ watch squeue -u $USER # 6. Check logs when complete -ls -lh slurm_output/ -tail -f slurm_output/madengine-*__*.out +ls -lh slurm_results/ +tail -f slurm_results/madengine-*__*.out ``` ### vLLM Inference Workflow @@ -739,7 +739,7 @@ MODEL_DIR=models/llama2-70b madengine run \ --manifest-file build_manifest.json # 5. Monitor for OOM errors -tail -f slurm_output/madengine-*__*.err | grep -i "memory" +tail -f slurm_results/madengine-*__*.err | grep -i "memory" # 6. If OOM occurs, adjust config and rebuild # Edit your config file to set VLLM_KV_CACHE_SIZE to 0.6 or 0.7 diff --git a/examples/slurm-configs/basic/01-single-node-single-gpu.json b/examples/slurm-configs/basic/01-single-node-single-gpu.json index 52324c1a..c0877717 100644 --- a/examples/slurm-configs/basic/01-single-node-single-gpu.json +++ b/examples/slurm-configs/basic/01-single-node-single-gpu.json @@ -12,7 +12,7 @@ "nodes": 1, "gpus_per_node": 1, "time": "01:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": false }, diff --git a/examples/slurm-configs/basic/02-single-node-multi-gpu.json b/examples/slurm-configs/basic/02-single-node-multi-gpu.json index f72fd0e2..a0e5b6ae 100644 --- a/examples/slurm-configs/basic/02-single-node-multi-gpu.json +++ b/examples/slurm-configs/basic/02-single-node-multi-gpu.json @@ -12,7 +12,7 @@ "nodes": 1, "gpus_per_node": 8, "time": "12:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/03-multi-node-basic-nodelist.json b/examples/slurm-configs/basic/03-multi-node-basic-nodelist.json index b8a7ea0a..8a0323ad 100644 --- 
a/examples/slurm-configs/basic/03-multi-node-basic-nodelist.json +++ b/examples/slurm-configs/basic/03-multi-node-basic-nodelist.json @@ -10,7 +10,7 @@ "nodelist": "node01,node02", "gpus_per_node": 8, "time": "24:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true, "network_interface": "eth0" }, diff --git a/examples/slurm-configs/basic/03-multi-node-basic.json b/examples/slurm-configs/basic/03-multi-node-basic.json index 9c1f2de0..006890a7 100644 --- a/examples/slurm-configs/basic/03-multi-node-basic.json +++ b/examples/slurm-configs/basic/03-multi-node-basic.json @@ -12,7 +12,7 @@ "nodes": 2, "gpus_per_node": 8, "time": "24:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true, "network_interface": "eth0" }, diff --git a/examples/slurm-configs/basic/04-multi-node-advanced.json b/examples/slurm-configs/basic/04-multi-node-advanced.json index bf30fb0b..1708f078 100644 --- a/examples/slurm-configs/basic/04-multi-node-advanced.json +++ b/examples/slurm-configs/basic/04-multi-node-advanced.json @@ -12,7 +12,7 @@ "nodes": 4, "gpus_per_node": 8, "time": "48:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "results_dir": "/shared/results", "shared_workspace": "/shared/workspace", "exclusive": true, diff --git a/examples/slurm-configs/basic/05-vllm-single-node.json b/examples/slurm-configs/basic/05-vllm-single-node.json index a632bc0d..7d77c4df 100644 --- a/examples/slurm-configs/basic/05-vllm-single-node.json +++ b/examples/slurm-configs/basic/05-vllm-single-node.json @@ -12,7 +12,7 @@ "nodes": 1, "gpus_per_node": 4, "time": "02:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/06-vllm-multi-node.json b/examples/slurm-configs/basic/06-vllm-multi-node.json index b76d34b9..d51262db 100644 --- a/examples/slurm-configs/basic/06-vllm-multi-node.json +++ 
b/examples/slurm-configs/basic/06-vllm-multi-node.json @@ -19,7 +19,7 @@ "nodes": 2, "gpus_per_node": 4, "time": "00:45:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true, "_comment_node_check": "Preflight GPU health check (helps avoid OOM from stale processes)", diff --git a/examples/slurm-configs/basic/07-sglang-single-node.json b/examples/slurm-configs/basic/07-sglang-single-node.json index ad82947a..8aaae928 100644 --- a/examples/slurm-configs/basic/07-sglang-single-node.json +++ b/examples/slurm-configs/basic/07-sglang-single-node.json @@ -12,7 +12,7 @@ "nodes": 1, "gpus_per_node": 4, "time": "02:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/08-sglang-multi-node.json b/examples/slurm-configs/basic/08-sglang-multi-node.json index 7519b513..8485b93c 100644 --- a/examples/slurm-configs/basic/08-sglang-multi-node.json +++ b/examples/slurm-configs/basic/08-sglang-multi-node.json @@ -12,7 +12,7 @@ "nodes": 2, "gpus_per_node": 4, "time": "04:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/cluster-amd-rccl.json b/examples/slurm-configs/basic/cluster-amd-rccl.json index e15c0a6f..e70f8721 100644 --- a/examples/slurm-configs/basic/cluster-amd-rccl.json +++ b/examples/slurm-configs/basic/cluster-amd-rccl.json @@ -19,7 +19,7 @@ "nodes": 1, "gpus_per_node": 8, "time": "12:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/sglang-disagg-custom-split.json index 291a5938..f38bcf64 100644 --- a/examples/slurm-configs/basic/sglang-disagg-custom-split.json +++ b/examples/slurm-configs/basic/sglang-disagg-custom-split.json @@ -20,7 +20,7 @@ "nodes": 7, "gpus_per_node": 8, "time": "04:00:00", - "output_dir": 
"./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/sglang-disagg-multi-node.json index 0c5ec00d..7dfbae19 100644 --- a/examples/slurm-configs/basic/sglang-disagg-multi-node.json +++ b/examples/slurm-configs/basic/sglang-disagg-multi-node.json @@ -19,7 +19,7 @@ "nodes": 5, "gpus_per_node": 8, "time": "04:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true }, diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index a63d3658..214b966c 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -352,7 +352,6 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row console.print("[yellow]⚠️ Performance CSV is empty[/yellow]") return - # Get session_start_row to mark current runs (don't filter, just mark) total_rows = len(df) # Try parameter first, then fall back to marker file @@ -378,7 +377,7 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Index", justify="right", style="dim") perf_table.add_column("Model", style="cyan") perf_table.add_column("Topology", justify="center", style="blue") - perf_table.add_column("Launcher", justify="center", style="magenta") # Distributed launcher + perf_table.add_column("Launcher", justify="center", style="magenta") perf_table.add_column("Deployment", justify="center", style="cyan") perf_table.add_column("GPU Arch", style="yellow") perf_table.add_column("Performance", justify="right", style="green") @@ -388,12 +387,15 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Data Name", style="magenta") perf_table.add_column("Data Provider", style="magenta") - # Helper function to format duration + # Helper function to format duration (accepts float seconds or "Xs" string) def format_duration(duration): - 
if pd.isna(duration) or duration == "": + if pd.isna(duration) or duration == "" or duration is None: return "N/A" try: - dur = float(duration) + if isinstance(duration, str) and duration.strip().endswith("s"): + dur = float(duration.strip()[:-1]) + else: + dur = float(duration) if dur < 1: return f"{dur*1000:.0f}ms" elif dur < 60: @@ -401,7 +403,7 @@ def format_duration(duration): else: return f"{dur/60:.1f}m" except (ValueError, TypeError): - return "N/A" + return str(duration) if duration else "N/A" # Helper function to format performance def format_performance(perf): diff --git a/src/madengine/core/constants.py b/src/madengine/core/constants.py index c98980a8..8ca10483 100644 --- a/src/madengine/core/constants.py +++ b/src/madengine/core/constants.py @@ -94,42 +94,66 @@ def _load_credentials(): CREDS = _load_credentials() +# Default value used for NAS_NODES when neither env nor creds provide it. +_DEFAULT_NAS_NODES = [ + { + "NAME": "DEFAULT", + "HOST": "localhost", + "PORT": 22, + "USERNAME": "username", + "PASSWORD": "password", + } +] + +# Default value used for MAD_AWS_S3 when neither env nor creds provide it. +_DEFAULT_MAD_AWS_S3 = {"USERNAME": None, "PASSWORD": None} + +# Default value used for MAD_MINIO when neither env nor creds provide it. +_DEFAULT_MAD_MINIO = { + "USERNAME": None, + "PASSWORD": None, + "MINIO_ENDPOINT": "http://localhost:9000", + "AWS_ENDPOINT_URL_S3": "http://localhost:9000", +} + +# Default value used for PUBLIC_GITHUB_ROCM_KEY when neither env nor creds provide it. +_DEFAULT_PUBLIC_GITHUB_ROCM_KEY = {"username": None, "token": None} + + +def _get_env_or_creds_or_default(env_key: str, creds_key: str, default): + """Load config from env (JSON), creds file, or default. + + Priority: 1) environment variable (parsed as JSON), 2) credential file, 3) default. + Invalid JSON in env falls back to default with logging. + + Args: + env_key: Environment variable name (e.g. "NAS_NODES"). + creds_key: Key in CREDS dict (e.g. "NAS_NODES"). 
+ default: Default value if env unset and creds missing or env JSON invalid. + + Returns: + Loaded value (same type as default, or from env/creds). + """ + if env_key not in os.environ: + _log_config_info(f"{env_key} environment variable is not set.") + if creds_key in CREDS: + _log_config_info(f"{creds_key} loaded from credentials file.") + return CREDS[creds_key] + _log_config_info(f"{creds_key} is using default values.") + return default + _log_config_info(f"{env_key} is loaded from env variables.") + try: + return json.loads(os.environ[env_key]) + except json.JSONDecodeError as e: + _log_config_info( + f"Error parsing {env_key} environment variable: {e}, using defaults" + ) + return default + def _get_nas_nodes(): """Initialize NAS_NODES configuration.""" - if "NAS_NODES" not in os.environ: - _log_config_info("NAS_NODES environment variable is not set.") - if "NAS_NODES" in CREDS: - _log_config_info("NAS_NODES loaded from credentials file.") - return CREDS["NAS_NODES"] - else: - _log_config_info("NAS_NODES is using default values.") - return [ - { - "NAME": "DEFAULT", - "HOST": "localhost", - "PORT": 22, - "USERNAME": "username", - "PASSWORD": "password", - } - ] - else: - _log_config_info("NAS_NODES is loaded from env variables.") - try: - return json.loads(os.environ["NAS_NODES"]) - except json.JSONDecodeError as e: - _log_config_info( - f"Error parsing NAS_NODES environment variable: {e}, using defaults" - ) - return [ - { - "NAME": "DEFAULT", - "HOST": "localhost", - "PORT": 22, - "USERNAME": "username", - "PASSWORD": "password", - } - ] + return _get_env_or_creds_or_default("NAS_NODES", "NAS_NODES", _DEFAULT_NAS_NODES) NAS_NODES = _get_nas_nodes() @@ -137,94 +161,36 @@ def _get_nas_nodes(): def _get_mad_aws_s3(): """Initialize MAD_AWS_S3 configuration.""" - if "MAD_AWS_S3" not in os.environ: - _log_config_info("MAD_AWS_S3 environment variable is not set.") - if "MAD_AWS_S3" in CREDS: - _log_config_info("MAD_AWS_S3 loaded from credentials file.") - return 
CREDS["MAD_AWS_S3"] - else: - _log_config_info("MAD_AWS_S3 is using default values.") - return { - "USERNAME": None, - "PASSWORD": None, - } - else: - _log_config_info("MAD_AWS_S3 is loaded from env variables.") - try: - return json.loads(os.environ["MAD_AWS_S3"]) - except json.JSONDecodeError as e: - _log_config_info( - f"Error parsing MAD_AWS_S3 environment variable: {e}, using defaults" - ) - return { - "USERNAME": None, - "PASSWORD": None, - } + return _get_env_or_creds_or_default("MAD_AWS_S3", "MAD_AWS_S3", _DEFAULT_MAD_AWS_S3) MAD_AWS_S3 = _get_mad_aws_s3() -# Check the MAD_MINIO environment variable which is a dict. def _get_mad_minio(): - """Initialize MAD_MINIO configuration.""" - if "MAD_MINIO" not in os.environ: - _log_config_info("MAD_MINIO environment variable is not set.") - if "MAD_MINIO" in CREDS: - _log_config_info("MAD_MINIO loaded from credentials file.") - return CREDS["MAD_MINIO"] - else: - _log_config_info("MAD_MINIO is using default values.") - return { - "USERNAME": None, - "PASSWORD": None, - "MINIO_ENDPOINT": "http://localhost:9000", - "AWS_ENDPOINT_URL_S3": "http://localhost:9000", - } - else: - _log_config_info("MAD_MINIO is loaded from env variables.") - try: - return json.loads(os.environ["MAD_MINIO"]) - except json.JSONDecodeError as e: - _log_config_info( - f"Error parsing MAD_MINIO environment variable: {e}, using defaults" - ) - return { - "USERNAME": None, - "PASSWORD": None, - "MINIO_ENDPOINT": "http://localhost:9000", - "AWS_ENDPOINT_URL_S3": "http://localhost:9000", - } + """Initialize MAD_MINIO configuration (dict with USERNAME, PASSWORD, MINIO_ENDPOINT, etc.).""" + return _get_env_or_creds_or_default("MAD_MINIO", "MAD_MINIO", _DEFAULT_MAD_MINIO) MAD_MINIO = _get_mad_minio() def _get_public_github_rocm_key(): - """Initialize PUBLIC_GITHUB_ROCM_KEY configuration.""" - if "PUBLIC_GITHUB_ROCM_KEY" not in os.environ: - _log_config_info("PUBLIC_GITHUB_ROCM_KEY environment variable is not set.") - if "PUBLIC_GITHUB_ROCM_KEY" in 
CREDS: - _log_config_info("PUBLIC_GITHUB_ROCM_KEY loaded from credentials file.") - return CREDS["PUBLIC_GITHUB_ROCM_KEY"] - else: - _log_config_info("PUBLIC_GITHUB_ROCM_KEY is using default values.") - return { - "username": None, - "token": None, - } - else: - _log_config_info("PUBLIC_GITHUB_ROCM_KEY is loaded from env variables.") - try: - return json.loads(os.environ["PUBLIC_GITHUB_ROCM_KEY"]) - except json.JSONDecodeError as e: - _log_config_info( - f"Error parsing PUBLIC_GITHUB_ROCM_KEY environment variable: {e}, using defaults" - ) - return { - "username": None, - "token": None, - } + """Initialize PUBLIC_GITHUB_ROCM_KEY configuration. + + Returned dict always has keys 'username' and 'token' (public API). + Credential files may use 'password' for the token; that is normalized to 'token'. + """ + raw = _get_env_or_creds_or_default( + "PUBLIC_GITHUB_ROCM_KEY", + "PUBLIC_GITHUB_ROCM_KEY", + _DEFAULT_PUBLIC_GITHUB_ROCM_KEY, + ) + if not isinstance(raw, dict): + return dict(_DEFAULT_PUBLIC_GITHUB_ROCM_KEY) + # Normalize so public API always has username + token (accept creds that use "password") + token = raw.get("token") or raw.get("password") + return {"username": raw.get("username"), "token": token} PUBLIC_GITHUB_ROCM_KEY = _get_public_github_rocm_key() diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index e1d93b61..836ab81e 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -17,6 +17,7 @@ import collections.abc import os import re +import sys import typing # third-party modules @@ -397,8 +398,11 @@ def get_gpu_vendor(self) -> str: for amd_smi_path in amd_smi_paths: if os.path.exists(amd_smi_path): try: + # Debug: log to stderr so SLURM node .err captures where we are if killed + print(f"[DEBUG] get_gpu_vendor: trying amd-smi at {amd_smi_path}", file=sys.stderr, flush=True) # Verify amd-smi actually works (180s timeout for slow GPU initialization) result = self.console.sh(f"{amd_smi_path} list > 
/dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + print(f"[DEBUG] get_gpu_vendor: amd-smi returned", file=sys.stderr, flush=True) if result and result.strip() == "AMD": return "AMD" except Exception as e: @@ -408,7 +412,9 @@ def get_gpu_vendor(self) -> str: rocm_smi_path = os.path.join(self._rocm_path, "bin", "rocm-smi") if os.path.exists(rocm_smi_path): try: + print(f"[DEBUG] get_gpu_vendor: trying rocm-smi at {rocm_smi_path}", file=sys.stderr, flush=True) result = self.console.sh(f"{rocm_smi_path} --showid > /dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + print(f"[DEBUG] get_gpu_vendor: rocm-smi returned", file=sys.stderr, flush=True) if result and result.strip() == "AMD": return "AMD" except Exception as e: diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py index 33a338a9..52bbe02f 100644 --- a/src/madengine/deployment/base.py +++ b/src/madengine/deployment/base.py @@ -14,11 +14,27 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional +from jinja2 import Environment, FileSystemLoader from rich.console import Console +def create_jinja_env(template_dir: Path) -> Environment: + """Create a Jinja2 Environment with common filters for deployment templates. + + Args: + template_dir: Path to the template directory (e.g. deployment/templates/slurm). + + Returns: + Jinja2 Environment with dirname and basename filters registered. 
Expected format (from training scripts): + performance: <value> <metric> + node_id: <node_id> + local_gpus: <count> + test_duration: <seconds>s
+ """ + import re + + perf_pattern = r"performance:\s*([\d.]+)\s+(\S+)" + match = re.search(perf_pattern, log_content) + if not match: + return None + + value = float(match.group(1)) + metric = match.group(2) + + node_id_pattern = r"node_id:\s*(\d+)" + node_match = re.search(node_id_pattern, log_content) + node_id = int(node_match.group(1)) if node_match else None + + local_gpus_pattern = r"local_gpus:\s*(\d+)" + gpus_match = re.search(local_gpus_pattern, log_content) + local_gpus = int(gpus_match.group(1)) if gpus_match else 1 + + duration_pattern = r"test_duration:\s*([\d.]+)s" + duration_match = re.search(duration_pattern, log_content) + if duration_match: + duration = f"{duration_match.group(1)}s" + else: + # Also match container output: "Test Duration: 12.34 seconds" + alt_duration_pattern = r"Test Duration:\s*([\d.]+)\s*seconds" + alt_match = re.search(alt_duration_pattern, log_content, re.IGNORECASE) + duration = f"{alt_match.group(1)}s" if alt_match else "N/A" + + gpu_arch_pattern = r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)" + gpu_arch_match = re.search(gpu_arch_pattern, log_content) + gpu_arch = gpu_arch_match.group(1) if gpu_arch_match else "N/A" + + return { + "model": model_name, + "performance": value, + "metric": metric, + "node_id": node_id, + "local_gpus": local_gpus, + "duration": duration, + "test_duration": duration, + "gpu_architecture": gpu_arch, + "data_name": "N/A", + "data_provider": "N/A", + } + + def _determine_aggregation_method(self, metric_name: str) -> str: + """ + Determine how to aggregate a metric based on its name/type. 
+ + Returns: + "sum", "average", or "max" + """ + metric_lower = metric_name.lower() + if any( + keyword in metric_lower + for keyword in [ + "throughput", + "samples_per_second", + "tokens_per_second", + "images_per_second", + "requests_per_second", + "qps", + "bandwidth", + "ops_per_second", + "samples/sec", + "tokens/sec", + ] + ): + return "sum" + if any( + keyword in metric_lower + for keyword in [ + "latency", + "time", + "duration", + "milliseconds", + "seconds", + "ttft", + "tpot", + "response_time", + "accuracy", + "precision", + "recall", + "f1", + "loss", + ] + ): + return "average" + if any( + keyword in metric_lower + for keyword in ["memory", "bytes", "ram", "vram", "gb", "mb"] + ): + return "max" + self.console.print( + f"[yellow]⚠ Unknown metric type '{metric_name}', using sum aggregation[/yellow]" + ) + return "sum" + + def _aggregate_node_metrics( + self, + per_node_metrics: List[Dict[str, Any]], + nnodes: int, + launcher_type: str, + ) -> Optional[Dict[str, Any]]: + """ + Aggregate per-node metrics into a single job-level record. + + Throughput metrics are summed; latency/accuracy averaged; memory max. + Used by both SLURM and Kubernetes for multi-node result aggregation. 
+ """ + import statistics + + if not per_node_metrics: + return None + + first_metric = per_node_metrics[0] + metric_name = first_metric["metric"] + aggregation_method = self._determine_aggregation_method(metric_name) + + if aggregation_method == "sum": + aggregated_value = sum(m["performance"] for m in per_node_metrics) + method_desc = "sum_across_nodes" + elif aggregation_method == "average": + aggregated_value = statistics.mean(m["performance"] for m in per_node_metrics) + method_desc = "average_across_nodes" + elif aggregation_method == "max": + aggregated_value = max(m["performance"] for m in per_node_metrics) + method_desc = "max_across_nodes" + else: + aggregated_value = sum(m["performance"] for m in per_node_metrics) + method_desc = "sum_across_nodes (default)" + + perfs = [m["performance"] for m in per_node_metrics] + if len(perfs) > 1: + statistics_dict = { + "mean": statistics.mean(perfs), + "std_dev": statistics.stdev(perfs), + "min": min(perfs), + "max": max(perfs), + "coefficient_variation": ( + statistics.stdev(perfs) / statistics.mean(perfs) + if statistics.mean(perfs) > 0 + else 0 + ), + } + else: + statistics_dict = { + "mean": perfs[0], + "std_dev": 0, + "min": perfs[0], + "max": perfs[0], + "coefficient_variation": 0, + } + + gpu_arch = "N/A" + for m in per_node_metrics: + if m.get("gpu_architecture") and m["gpu_architecture"] != "N/A": + gpu_arch = m["gpu_architecture"] + break + + durations = [ + m.get("duration", m.get("test_duration", "N/A")) + for m in per_node_metrics + if m.get("duration", "N/A") != "N/A" or m.get("test_duration", "N/A") != "N/A" + ] + if durations: + duration_values = [] + for d in durations: + if isinstance(d, str) and d.endswith("s"): + try: + duration_values.append(float(d[:-1])) + except ValueError: + pass + duration = f"{max(duration_values):.2f}s" if duration_values else "N/A" + else: + duration = "N/A" + + total_gpus = sum(m.get("local_gpus", 1) for m in per_node_metrics) + gpus_per_node = 
per_node_metrics[0].get("local_gpus", 1) if per_node_metrics else 1 + + aggregated_record = { + "model": first_metric["model"], + "n_gpus": total_gpus, + "nnodes": nnodes, + "gpus_per_node": gpus_per_node, + "performance": aggregated_value, + "metric": metric_name, + "status": "SUCCESS", + "topology": f"{nnodes}N×{gpus_per_node}G", + "launcher": launcher_type or "N/A", + "deployment_type": getattr(self, "DEPLOYMENT_TYPE", "base"), + "gpu_architecture": gpu_arch, + "test_duration": duration, + "data_name": first_metric.get("data_name", "N/A"), + "data_provider": first_metric.get("data_provider", "N/A"), + "aggregation_method": method_desc, + "nodes_contributing": len(per_node_metrics), + "per_node_mean": statistics_dict["mean"], + "per_node_std_dev": statistics_dict["std_dev"], + "per_node_cv": statistics_dict["coefficient_variation"], + } + return aggregated_record + + def _ensure_perf_csv_exists(self) -> None: + """Ensure perf.csv exists with standard header (for appending aggregated rows).""" + perf_csv_path = Path("perf.csv") + if perf_csv_path.exists(): + return + standard_header = ( + "model,n_gpus,nnodes,gpus_per_node,training_precision,pipeline,args,tags," + "docker_file,base_docker,docker_sha,docker_image,git_commit,machine_name," + "deployment_type,launcher,gpu_architecture,performance,metric,relative_change," + "status,build_duration,test_duration,dataname,data_provider_type,data_size," + "data_download_duration,build_number,additional_docker_run_options" + ) + perf_csv_path.write_text(standard_header + "\n", encoding="utf-8") + + def _write_to_perf_csv(self, perf_data: Dict[str, Any]) -> None: + """ + Append one row to perf.csv. Uses existing header if file exists for column order. 
+ """ + import csv + + perf_csv_path = Path("perf.csv") + default_headers = [ + "model", + "n_gpus", + "nnodes", + "gpus_per_node", + "training_precision", + "pipeline", + "args", + "tags", + "docker_file", + "base_docker", + "docker_sha", + "docker_image", + "git_commit", + "machine_name", + "deployment_type", + "launcher", + "gpu_architecture", + "performance", + "metric", + "relative_change", + "status", + "build_duration", + "test_duration", + "dataname", + "data_provider_type", + "data_size", + "data_download_duration", + "build_number", + "additional_docker_run_options", + ] + file_exists = perf_csv_path.exists() + existing_header = None + if file_exists: + with open( + perf_csv_path, "r", newline="", encoding="utf-8", errors="replace" + ) as rf: + reader = csv.reader(rf) + existing_header = next(reader, None) + headers = existing_header if existing_header else default_headers + if file_exists and existing_header: + row_by_name = {k: perf_data.get(k, "") for k in headers} + row_to_write = [str(row_by_name.get(h, "")) for h in headers] + else: + row_to_write = perf_data + + with open(perf_csv_path, "a", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=headers, extrasaction="ignore") + if not file_exists: + writer.writeheader() + if file_exists and existing_header: + csv.writer(f).writerow(row_to_write) + else: + writer.writerow(row_to_write) + diff --git a/src/madengine/deployment/common.py b/src/madengine/deployment/common.py new file mode 100644 index 00000000..a122818c --- /dev/null +++ b/src/madengine/deployment/common.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +""" +Shared deployment utilities used by both SLURM and Kubernetes deployments. + +Provides launcher normalization, ROCm profiling checks, and multi-node +profiling configuration so logic is not duplicated across deployment modules. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+""" + +import subprocess +from typing import Any, Dict, List, Optional + +# Valid distributed launchers (used by normalize_launcher) +VALID_LAUNCHERS = [ + "torchrun", + "torchtitan", + "deepspeed", + "megatron-lm", + "vllm", + "sglang", + "sglang-disagg" +] + + +def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> str: + """ + Normalize launcher field based on deployment type and launcher value. + + Logic: + - If launcher is in VALID_LAUNCHERS: keep as-is + - If launcher is None/empty/invalid: + * local → "docker" (runs in Docker container) + * slurm → "docker" (typically uses containers on compute nodes) + * kubernetes → "native" (pod itself is the container) + + Args: + launcher_type: Raw launcher type from config (may be None) + deployment_type: "local", "slurm", or "kubernetes" + + Returns: + Normalized launcher string + """ + if launcher_type and launcher_type in VALID_LAUNCHERS: + return launcher_type + if deployment_type == "local": + return "docker" + if deployment_type == "slurm": + return "docker" + if deployment_type == "kubernetes": + return "native" + return "docker" + + +def is_rocprofv3_available() -> bool: + """ + Check if rocprofv3 is available on the system. + + rocprofv3 is required for multi-node profiling with MPI support. + It's part of rocprofiler-sdk package in ROCm >= 6.4.1. + + Returns: + True if rocprofv3 is available and executable, False otherwise + """ + try: + result = subprocess.run( + ["rocprofv3", "--help"], + capture_output=True, + text=True, + timeout=5 + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): + return False + + +def configure_multi_node_profiling( + nnodes: int, + tools_config: List[Dict], + logger: Any +) -> Dict[str, Any]: + """ + Configure profiling for multi-node runs with rocprofv3 support. 
+ + Industry best practice for multi-node profiling: + - Profile ALL nodes to detect stragglers, load imbalances, and communication bottlenecks + - Use rocprofv3 (MPI-aware) for distributed profiling + - Collect per-node outputs for detailed analysis + + Logic: + 1. Single node (nnodes == 1): Use existing tool behavior + 2. Multi-node (nnodes > 1): + a. Check if rocprofv3 is available + b. If available: Enable per-node profiling, upgrade "rocprof" to "rocprofv3" + c. If not available: Log warning and skip profiling + + Args: + nnodes: Number of nodes in the deployment + tools_config: List of tool configurations from user + logger: Logger instance for messages + + Returns: + Dictionary with profiling configuration: + - enabled: bool + - mode: str - "single_node", "multi_node", or "multi_node_unsupported" + - tools: List[Dict] + - per_node_collection: bool + """ + if nnodes == 1: + return { + "enabled": True, + "mode": "single_node", + "tools": tools_config, + "per_node_collection": False + } + + if not is_rocprofv3_available(): + logger.warning( + "╔════════════════════════════════════════════════════════════════════════════╗\n" + "║ Multi-Node Profiling Requirements Not Met ║\n" + "╠════════════════════════════════════════════════════════════════════════════╣\n" + "║ Multi-node profiling requires rocprofv3 (MPI-aware profiling support). ║\n" + "║ ║\n" + "║ Current Status: rocprofv3 NOT FOUND on system ║\n" + "║ ║\n" + "║ Profiling will be SKIPPED for this multi-node run. 
║\n" + "║ ║\n" + "║ To enable multi-node profiling: ║\n" + "║ • Install rocprofiler-sdk package (ROCm >= 6.4.1) ║\n" + "║ • Command: apt install rocprofiler-sdk ║\n" + "║ • Or upgrade to ROCm 6.4.1 or later ║\n" + "║ ║\n" + "║ Note: Single-node profiling uses rocprof (no rocprofv3 required) ║\n" + "╚════════════════════════════════════════════════════════════════════════════╝" + ) + return { + "enabled": False, + "mode": "multi_node_unsupported", + "tools": [], + "per_node_collection": False + } + + logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") + + upgraded_tools: List[Dict] = [] + for tool in tools_config: + if not isinstance(tool, dict): + upgraded_tools.append(tool) + continue + tool_name = tool.get("name") + if tool_name == "rocprof": + logger.info( + " → Upgrading 'rocprof' to 'rocprofv3' for multi-node MPI compatibility" + ) + upgraded_tool = tool.copy() + upgraded_tool["name"] = "rocprofv3" + upgraded_tools.append(upgraded_tool) + else: + upgraded_tools.append(tool) + + if upgraded_tools: + tool_names = [ + t.get("name") if isinstance(t, dict) else str(t) for t in upgraded_tools + ] + logger.info(f" → Multi-node profiling tools: {', '.join(filter(None, tool_names))}") + if "rccl_trace" in tool_names: + logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") + + return { + "enabled": True, + "mode": "multi_node", + "tools": upgraded_tools, + "per_node_collection": True, + "profiler": "rocprofv3", + "wrapper_mode": "launcher" + } diff --git a/src/madengine/deployment/config_loader.py b/src/madengine/deployment/config_loader.py index 5afbe7b7..fdbf3c94 100644 --- a/src/madengine/deployment/config_loader.py +++ b/src/madengine/deployment/config_loader.py @@ -13,10 +13,27 @@ import json import os from pathlib import Path -from typing import Dict, Any, Optional +from typing import Any, Callable, Dict, Optional from copy import deepcopy +def apply_deployment_config(config: Any, load_fn: 
Callable[[Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]: + """Apply deployment defaults via a loader and set config.additional_context. + + Used by SLURM and Kubernetes deployment classes before calling super().__init__(config). + + Args: + config: DeploymentConfig (or any object with additional_context attribute). + load_fn: ConfigLoader.load_slurm_config or ConfigLoader.load_k8s_config. + + Returns: + The merged full configuration dict. + """ + full_config = load_fn(config.additional_context or {}) + config.additional_context = full_config + return full_config + + class ConfigLoader: """Smart configuration loader with preset support.""" diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 29c9874e..d84f4384 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -33,14 +33,22 @@ except ImportError: YAML_AVAILABLE = False -from jinja2 import Environment, FileSystemLoader, Template - -from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus -from .config_loader import ConfigLoader +from jinja2 import Template + +from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env +from .common import ( + VALID_LAUNCHERS, + configure_multi_node_profiling, + is_rocprofv3_available, + normalize_launcher, +) +from .config_loader import ConfigLoader, apply_deployment_config from madengine.core.dataprovider import Data from madengine.core.context import Context from madengine.core.errors import ConfigurationError, create_error_context from madengine.utils.gpu_config import resolve_runtime_gpus +from madengine.utils.path_utils import get_madengine_root, scripts_base_dir_from +from madengine.utils.run_details import flatten_tags_in_place, get_build_number, get_pipeline try: from madengine.reporting.update_perf_csv import update_perf_csv @@ -50,184 +58,6 @@ REPORTING_AVAILABLE = False -# Valid distributed launchers 
-VALID_LAUNCHERS = [ - "torchrun", - "torchtitan", - "deepspeed", - "megatron-lm", - "vllm", - "sglang", - "sglang-disagg" -] - - -def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> str: - """ - Normalize launcher field based on deployment type and launcher value. - - Logic: - - If launcher is in VALID_LAUNCHERS: keep as-is - - If launcher is None/empty/invalid: - * local → "docker" (runs in Docker container) - * slurm → "docker" (typically uses containers on compute nodes) - * kubernetes → "native" (pod itself is the container) - - Args: - launcher_type: Raw launcher type from config (may be None) - deployment_type: "local", "slurm", or "kubernetes" - - Returns: - Normalized launcher string - """ - # If launcher is valid, keep it - if launcher_type and launcher_type in VALID_LAUNCHERS: - return launcher_type - - # Otherwise, default based on deployment type - if deployment_type == "local": - return "docker" - elif deployment_type == "slurm": - return "docker" - elif deployment_type == "kubernetes": - return "native" - else: - # Fallback for unknown deployment types - return "docker" - - -def is_rocprofv3_available() -> bool: - """ - Check if rocprofv3 is available on the system. - - rocprofv3 is required for multi-node profiling with MPI support. - It's part of rocprofiler-sdk package in ROCm >= 6.4.1. - - Returns: - True if rocprofv3 is available and executable, False otherwise - """ - try: - # Note: rocprofv3 doesn't support --version, use --help instead - result = subprocess.run( - ["rocprofv3", "--help"], - capture_output=True, - text=True, - timeout=5 - ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired, OSError): - return False - - -def configure_multi_node_profiling( - nnodes: int, - tools_config: List[Dict], - logger -) -> Dict[str, Any]: - """ - Configure profiling for multi-node runs with rocprofv3 support. 
- - Industry best practice for multi-node profiling: - - Profile ALL nodes to detect stragglers, load imbalances, and communication bottlenecks - - Use rocprofv3 (MPI-aware) for distributed profiling - - Collect per-node outputs for detailed analysis - - Logic: - 1. Single node (nnodes == 1): Use existing tool behavior - 2. Multi-node (nnodes > 1): - a. Check if rocprofv3 is available - b. If available: Enable per-node profiling, upgrade "rocprof" to "rocprofv3" - c. If not available: Log warning and skip profiling - - Args: - nnodes: Number of nodes in the deployment - tools_config: List of tool configurations from user - logger: Logger instance for messages - - Returns: - Dictionary with profiling configuration: - - enabled: bool - Whether profiling is enabled - - mode: str - "single_node", "multi_node", or "multi_node_unsupported" - - tools: List[Dict] - Processed tool configurations - - per_node_collection: bool - Whether to collect from all nodes - """ - if nnodes == 1: - # Single node - existing behavior works fine - return { - "enabled": True, - "mode": "single_node", - "tools": tools_config, - "per_node_collection": False - } - - # Multi-node case - check rocprofv3 availability - if not is_rocprofv3_available(): - logger.warning( - "╔════════════════════════════════════════════════════════════════════════════╗\n" - "║ Multi-Node Profiling Requirements Not Met ║\n" - "╠════════════════════════════════════════════════════════════════════════════╣\n" - "║ Multi-node profiling requires rocprofv3 (MPI-aware profiling support). ║\n" - "║ ║\n" - "║ Current Status: rocprofv3 NOT FOUND on system ║\n" - "║ ║\n" - "║ Profiling will be SKIPPED for this multi-node run. 
║\n" - "║ ║\n" - "║ To enable multi-node profiling: ║\n" - "║ • Install rocprofiler-sdk package (ROCm >= 6.4.1) ║\n" - "║ • Command: apt install rocprofiler-sdk ║\n" - "║ • Or upgrade to ROCm 6.4.1 or later ║\n" - "║ ║\n" - "║ Note: Single-node profiling uses rocprof (no rocprofv3 required) ║\n" - "╚════════════════════════════════════════════════════════════════════════════╝" - ) - return { - "enabled": False, - "mode": "multi_node_unsupported", - "tools": [], - "per_node_collection": False - } - - # rocprofv3 is available - enable full multi-node profiling - logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") - - # Upgrade "rocprof" tools to "rocprofv3" for multi-node compatibility - upgraded_tools = [] - rocprof_upgraded = False - - for tool in tools_config: - tool_name = tool.get("name") - - if tool_name == "rocprof": - # Upgrade to rocprofv3 for multi-node MPI support - logger.info( - f" → Upgrading 'rocprof' to 'rocprofv3' for multi-node MPI compatibility" - ) - upgraded_tool = tool.copy() - upgraded_tool["name"] = "rocprofv3" - upgraded_tools.append(upgraded_tool) - rocprof_upgraded = True - else: - upgraded_tools.append(tool) - - # Log profiling tools being used - if upgraded_tools: - tool_names = [t.get("name") for t in upgraded_tools] - logger.info(f" → Multi-node profiling tools: {', '.join(tool_names)}") - - # Highlight RCCL trace if present (critical for multi-node communication) - if "rccl_trace" in tool_names: - logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") - - return { - "enabled": True, - "mode": "multi_node", - "tools": upgraded_tools, - "per_node_collection": True, - "profiler": "rocprofv3", - "wrapper_mode": "launcher" - } - - class KubernetesDeployment(BaseDeployment): """ Kubernetes cluster deployment using Python client library. 
@@ -271,11 +101,7 @@ def __init__(self, config: DeploymentConfig): "Install with: pip install pyyaml" ) - # Apply intelligent defaults using ConfigLoader - # This merges built-in presets with user configuration - full_config = ConfigLoader.load_k8s_config(config.additional_context) - config.additional_context = full_config - + apply_deployment_config(config, ConfigLoader.load_k8s_config) super().__init__(config) # Parse K8s configuration (now with defaults applied) @@ -288,11 +114,8 @@ def __init__(self, config: DeploymentConfig): # Setup Jinja2 template environment template_dir = Path(__file__).parent / "templates" / "kubernetes" - self.jinja_env = Environment(loader=FileSystemLoader(str(template_dir))) - - # Register custom Jinja2 filters - self.jinja_env.filters['dirname'] = lambda path: str(Path(path).parent) - + self.jinja_env = create_jinja_env(template_dir) + # Initialize data provider (will be used if models need data) self.data = None self.context_for_data = None @@ -454,7 +277,7 @@ def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) - return # Load tools.json to get pre/post script definitions - tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + tools_json_path = get_madengine_root() / "scripts" / "common" / "tools.json" if not tools_json_path.exists(): return @@ -492,7 +315,7 @@ def _load_common_scripts(self, script_list: List[Dict]) -> Dict[str, str]: """ import os script_contents = {} - madengine_root = Path(__file__).parent.parent # Go up to madengine/ directory + madengine_root = get_madengine_root() for script_config in script_list: script_path = script_config.get("path", "") @@ -784,7 +607,7 @@ def _prepare_template_context( data_provider_script = k8s_tools_config["data_providers"][provider_type] # Load K8s data provider script content - k8s_script_path = Path(__file__).parent.parent / data_provider_script["script"] + k8s_script_path = get_madengine_root() / 
data_provider_script["script"] if k8s_script_path.exists(): with open(k8s_script_path, "r") as f: data_provider_script_content = f.read() @@ -2855,7 +2678,9 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: }) # 2. Parse NODE-LOCAL performance from log - perf_data = self._parse_node_performance(log, model_info, build_info) + perf_data = self._parse_performance_from_log( + log, model_info.get("name", "") + ) # Get pod exit status pod_status = pod.status.phase @@ -2967,21 +2792,51 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) if aggregated_record: - # Write ONE aggregated row to perf.csv (CRITICAL for database) - self._write_to_perf_csv(aggregated_record) - + # Full reporting pipeline: perf_entry at project root, then update_* (same as local/SLURM) + self._ensure_perf_csv_exists() + run_details_dict = self._build_perf_entry_from_aggregated( + aggregated_record, model_info, build_info, deployment_id + ) + perf_entry_path = Path("perf_entry.json") + with open(perf_entry_path, "w", encoding="utf-8") as f: + json.dump(run_details_dict, f, indent=2) + if run_details_dict.get("status") == "SUCCESS": + update_perf_csv(perf_csv="perf.csv", single_result=str(perf_entry_path)) + else: + update_perf_csv(perf_csv="perf.csv", exception_result=str(perf_entry_path)) + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + try: + if run_details_dict.get("status") == "SUCCESS": + num_entries = update_perf_super_json( + single_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + else: + num_entries = update_perf_super_json( + exception_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries, + ) + except Exception as e: + self.console.print(f"[yellow]⚠ Could not update 
perf_super: {e}[/yellow]") results["successful_runs"].append({ "model": model_info.get("name"), "perf_data": aggregated_record, - "nodes": results["nodes"], # Include per-node details - "per_node_metrics": per_node_metrics # For detailed analysis + "nodes": results["nodes"], + "per_node_metrics": per_node_metrics }) - self.console.print( f"[green]✓ Aggregated performance from {len(per_node_metrics)} nodes[/green]" ) self.console.print( - f"[green]✓ Updated local perf.csv[/green]" + f"[green]✓ Updated perf_entry.json, perf.csv, perf_super.* (Docker-compatible)[/green]" ) else: # No performance from log: try multiple_results CSV (same contract as local Docker) @@ -3012,7 +2867,7 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: model_name=model_info.get("name", ""), ) scripts_path = model_info.get("scripts", "") - scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + scripts_base_dir = scripts_base_dir_from(scripts_path) num_entries = update_perf_super_json( perf_super_json="perf_super.json", multiple_results=str(resolved_csv_path), @@ -3483,48 +3338,44 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s # Model configuration "training_precision": model_info.get("training_precision", ""), - "pipeline": os.environ.get("pipeline", ""), + "pipeline": get_pipeline(), "args": model_info.get("args", ""), "tags": model_info.get("tags", ""), - + # Build information "docker_file": build_info.get("dockerfile", ""), "base_docker": build_info.get("base_docker", ""), "docker_sha": build_info.get("docker_sha", ""), "docker_image": build_info.get("docker_image", ""), - + # Runtime information "git_commit": "", "machine_name": pod_name, "deployment_type": "kubernetes", "launcher": launcher, "gpu_architecture": "", - + # Performance metrics - FAILED "performance": "0", "metric": error_msg, # Store error message in metric field "relative_change": "", "status": "FAILURE", # Use "FAILURE" to match CSV schema - + # 
Timing "build_duration": build_info.get("build_duration", ""), "test_duration": "", - + # Data information "dataname": model_info.get("data", ""), "data_provider_type": "", "data_size": "", "data_download_duration": "", - + # Build tracking - "build_number": os.environ.get("BUILD_NUMBER", "0"), + "build_number": get_build_number(), "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), } - - # Flatten tags if they are in list format - if isinstance(result["tags"], list): - result["tags"] = ",".join(str(item) for item in result["tags"]) - + flatten_tags_in_place(result) return result # Standard perf.csv header (must match container_runner.ensure_perf_csv_exists) @@ -3543,6 +3394,67 @@ def _ensure_perf_csv_exists(self) -> None: perf_csv_path.write_text(self._PERF_CSV_HEADER + "\n", encoding="utf-8") self.console.print("[dim]Created perf.csv with standard header[/dim]") + def _build_perf_entry_from_aggregated( + self, + aggregated_record: Dict[str, Any], + model_info: Dict[str, Any], + build_info: Dict[str, Any], + deployment_id: str, + ) -> Dict[str, Any]: + """Build full run_details dict from aggregated record for perf_entry and update_* pipeline.""" + from madengine.utils.config_parser import ConfigParser + + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") + test_duration = aggregated_record.get("test_duration") or aggregated_record.get("duration", "") + run_details = { + "model": model_info.get("name", aggregated_record.get("model", "")), + "n_gpus": str(aggregated_record.get("n_gpus", nnodes * nproc_per_node)), + "nnodes": str(aggregated_record.get("nnodes", nnodes)), + "gpus_per_node": 
str(aggregated_record.get("gpus_per_node", nproc_per_node)), + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": "kubernetes", + "launcher": launcher, + "gpu_architecture": aggregated_record.get("gpu_architecture", ""), + "performance": str(aggregated_record.get("performance", "")), + "metric": aggregated_record.get("metric", ""), + "relative_change": "", + "status": aggregated_record.get("status", "SUCCESS"), + "build_duration": build_info.get("build_duration", ""), + "test_duration": test_duration, + "dataname": aggregated_record.get("data_name", model_info.get("data", "")), + "data_provider_type": aggregated_record.get("data_provider", ""), + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags_in_place(run_details) + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + run_details["configs"] = config_parser.parse_and_load( + model_info.get("args", ""), scripts_path + ) + except Exception: + run_details["configs"] = None + return run_details + def _build_common_info_dict( self, model_info: Dict, @@ -3571,7 +3483,7 @@ def _build_common_info_dict( "nnodes": nnodes_str, "gpus_per_node": gpus_per_node, "training_precision": model_info.get("training_precision", ""), - "pipeline": os.environ.get("pipeline", ""), + "pipeline": get_pipeline(), "args": model_info.get("args", ""), "tags": model_info.get("tags", ""), 
"docker_file": build_info.get("dockerfile", ""), @@ -3590,11 +3502,10 @@ def _build_common_info_dict( "data_provider_type": "", "data_size": "", "data_download_duration": "", - "build_number": os.environ.get("BUILD_NUMBER", "0"), + "build_number": get_build_number(), "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), } - if isinstance(result["tags"], list): - result["tags"] = ",".join(str(t) for t in result["tags"]) + flatten_tags_in_place(result) return result def _create_multiple_result_row_record( @@ -3625,7 +3536,7 @@ def _create_multiple_result_row_record( "nnodes": str(nnodes), "gpus_per_node": str(nproc_per_node), "training_precision": model_info.get("training_precision", ""), - "pipeline": os.environ.get("pipeline", ""), + "pipeline": get_pipeline(), "args": model_info.get("args", ""), "tags": model_info.get("tags", ""), "docker_file": build_info.get("dockerfile", ""), @@ -3647,82 +3558,12 @@ def _create_multiple_result_row_record( "data_provider_type": "", "data_size": "", "data_download_duration": "", - "build_number": os.environ.get("BUILD_NUMBER", "0"), + "build_number": get_build_number(), "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), } - if isinstance(result["tags"], list): - result["tags"] = ",".join(str(t) for t in result["tags"]) + flatten_tags_in_place(result) return result - def _parse_node_performance( - self, - log_content: str, - model_info: Dict, - build_info: Dict - ) -> Optional[Dict]: - """ - Parse node-local performance from log. 
- - Expected format in log (from updated run scripts): - performance: - node_id: - local_gpus: - - Args: - log_content: Pod log content - model_info: Model information dict - build_info: Build information dict - - Returns: - Dict with node performance data, or None if parsing failed - """ - import re - - perf_data = None - - # Parse performance line - perf_pattern = r"performance:\s*([\d.]+)\s+(\S+)" - match = re.search(perf_pattern, log_content) - - if match: - value = float(match.group(1)) - metric = match.group(2) - - # Try to extract node_id for validation - node_id_pattern = r"node_id:\s*(\d+)" - node_match = re.search(node_id_pattern, log_content) - node_id = int(node_match.group(1)) if node_match else None - - # Try to extract local_gpus - local_gpus_pattern = r"local_gpus:\s*(\d+)" - gpus_match = re.search(local_gpus_pattern, log_content) - local_gpus = int(gpus_match.group(1)) if gpus_match else 1 - - # Extract duration if available - duration_pattern = r"test_duration:\s*([\d.]+)s" - duration_match = re.search(duration_pattern, log_content) - duration = f"{duration_match.group(1)}s" if duration_match else "N/A" - - # Extract GPU architecture from rocEnvTool runtime detection - # Look for pattern: 🔹 Name : gfx942 or Name : gfx942 - gpu_arch_pattern = r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)" - gpu_arch_match = re.search(gpu_arch_pattern, log_content) - gpu_arch = gpu_arch_match.group(1) if gpu_arch_match else "N/A" - - perf_data = { - "model": model_info.get("name"), - "performance": value, - "metric": metric, - "node_id": node_id, - "local_gpus": local_gpus, - "duration": duration, - "gpu_architecture": gpu_arch, - "data_name": "N/A", - "data_provider": "N/A" - } - - return perf_data - def _parse_multiple_results_from_artifacts( self, results_dir: Path, @@ -3968,247 +3809,6 @@ def _resolve_multiple_results_csv( return merged_path return csv_paths[0] - def _determine_aggregation_method(self, metric_name: str) -> str: - """ - Determine how to aggregate a metric based 
on its name/type. - - Args: - metric_name: Name of the performance metric - - Returns: - "sum", "average", "max", or "unknown" - """ - metric_lower = metric_name.lower() - - # Throughput metrics - SUM - if any(keyword in metric_lower for keyword in [ - "throughput", "samples_per_second", "tokens_per_second", - "images_per_second", "requests_per_second", "qps", - "bandwidth", "ops_per_second", "samples/sec", "tokens/sec" - ]): - return "sum" - - # Latency metrics - AVERAGE - elif any(keyword in metric_lower for keyword in [ - "latency", "time", "duration", "milliseconds", "seconds", - "ttft", "tpot", "response_time" - ]): - return "average" - - # Accuracy metrics - AVERAGE - elif any(keyword in metric_lower for keyword in [ - "accuracy", "precision", "recall", "f1", "loss" - ]): - return "average" - - # Memory metrics - MAX - elif any(keyword in metric_lower for keyword in [ - "memory", "bytes", "ram", "vram", "gb", "mb" - ]): - return "max" - - else: - # Unknown - default to sum for throughput-like metrics (conservative) - self.console.print(f"[yellow]⚠ Unknown metric type '{metric_name}', using sum aggregation[/yellow]") - return "sum" - - def _aggregate_node_metrics( - self, - per_node_metrics: List[Dict], - nnodes: int, - launcher_type: str - ) -> Optional[Dict]: - """ - Aggregate per-node metrics into single job-level metric. - - Aggregation Strategy: - - Throughput (samples/sec, tokens/sec, images/sec): SUM - - Latency (ms, seconds): AVERAGE - - Accuracy (%, ratio): AVERAGE or LAST - - Memory (bytes, GB): MAX or SUM - - Args: - per_node_metrics: List of performance dicts from each node - nnodes: Number of nodes - launcher_type: Type of launcher (torchrun, deepspeed, etc.) 
- - Returns: - Dict with aggregated performance data for perf.csv - """ - import statistics - - if not per_node_metrics: - return None - - # Get metric type from first node - first_metric = per_node_metrics[0] - metric_name = first_metric["metric"] - - # Determine aggregation strategy based on metric type - aggregation_method = self._determine_aggregation_method(metric_name) - - if aggregation_method == "sum": - # Sum throughput metrics - aggregated_value = sum(m["performance"] for m in per_node_metrics) - method_desc = "sum_across_nodes" - elif aggregation_method == "average": - # Average latency/accuracy metrics - aggregated_value = statistics.mean(m["performance"] for m in per_node_metrics) - method_desc = "average_across_nodes" - elif aggregation_method == "max": - # Max for memory usage - aggregated_value = max(m["performance"] for m in per_node_metrics) - method_desc = "max_across_nodes" - else: - # Unknown - conservative sum - aggregated_value = sum(m["performance"] for m in per_node_metrics) - method_desc = "sum_across_nodes (default)" - - # Compute statistics for validation - perfs = [m["performance"] for m in per_node_metrics] - if len(perfs) > 1: - statistics_dict = { - "mean": statistics.mean(perfs), - "std_dev": statistics.stdev(perfs), - "min": min(perfs), - "max": max(perfs), - "coefficient_variation": statistics.stdev(perfs) / statistics.mean(perfs) if statistics.mean(perfs) > 0 else 0 - } - else: - statistics_dict = { - "mean": perfs[0], - "std_dev": 0, - "min": perfs[0], - "max": perfs[0], - "coefficient_variation": 0 - } - - # Get GPU architecture from any successful node - gpu_arch = "N/A" - for m in per_node_metrics: - if m.get("gpu_architecture") and m["gpu_architecture"] != "N/A": - gpu_arch = m["gpu_architecture"] - break - - # Get duration (use max across nodes - slowest determines job time) - durations = [m.get("duration", "N/A") for m in per_node_metrics if m.get("duration") != "N/A"] - if durations: - # Extract numeric value and find max 
- duration_values = [] - for d in durations: - if isinstance(d, str) and d.endswith("s"): - try: - duration_values.append(float(d[:-1])) - except ValueError: - pass - duration = f"{max(duration_values):.2f}s" if duration_values else "N/A" - else: - duration = "N/A" - - # Get total GPUs - total_gpus = sum(m.get("local_gpus", 1) for m in per_node_metrics) - gpus_per_node = per_node_metrics[0].get("local_gpus", 1) if per_node_metrics else 1 - - # Build aggregated record (matches perf.csv schema) - aggregated_record = { - "model": first_metric["model"], - "performance": aggregated_value, - "metric": metric_name, - "status": "SUCCESS", - "topology": f"{nnodes}N×{gpus_per_node}G", - "nnodes": nnodes, - "launcher": launcher_type or "N/A", - "deployment_type": "kubernetes", - "gpu_architecture": gpu_arch, - "test_duration": duration, # FIXED: Must match CSV header name - "data_name": first_metric.get("data_name", "N/A"), - "data_provider": first_metric.get("data_provider", "N/A"), - - # NEW: Aggregation metadata (for results_summary.json) - "aggregation_method": method_desc, - "nodes_contributing": len(per_node_metrics), - "per_node_mean": statistics_dict["mean"], - "per_node_std_dev": statistics_dict["std_dev"], - "per_node_cv": statistics_dict["coefficient_variation"] - } - - return aggregated_record - - def _write_to_perf_csv(self, perf_data: Dict): - """ - Write performance data to local perf.csv file. - - Uses the same format as local execution for consistency. - Matches the schema from container_runner.py's create_run_details_dict(). - - When appending to an existing file, uses the file's existing header so that - column order matches (e.g. MAD-private uses a different schema with model/performance - in different positions). Values are mapped by column name so model, performance, - metric, status etc. always land in the correct columns. 
- """ - import csv - from pathlib import Path - - perf_csv_path = Path("perf.csv") - - # CSV headers for NEW files (madengine standard order) - default_headers = [ - "model", - "n_gpus", - "nnodes", - "gpus_per_node", - "training_precision", - "pipeline", - "args", - "tags", - "docker_file", - "base_docker", - "docker_sha", - "docker_image", - "git_commit", - "machine_name", - "deployment_type", - "launcher", - "gpu_architecture", - "performance", - "metric", - "relative_change", - "status", - "build_duration", - "test_duration", - "dataname", - "data_provider_type", - "data_size", - "data_download_duration", - "build_number", - "additional_docker_run_options", - ] - - file_exists = perf_csv_path.exists() - existing_header = None - if file_exists: - # Read existing header so we write in the same column order (fixes MAD-private - # and other repos that use a different perf.csv schema with more/different columns) - with open(perf_csv_path, "r", newline="", encoding="utf-8", errors="replace") as rf: - reader = csv.reader(rf) - existing_header = next(reader, None) - headers = existing_header if existing_header else default_headers - if file_exists and existing_header: - # Build row by name so model/performance/metric/status go to correct columns - row_by_name = {k: perf_data.get(k, "") for k in headers} - row_to_write = [str(row_by_name.get(h, "")) for h in headers] - else: - row_to_write = perf_data - - with open(perf_csv_path, "a", newline="", encoding="utf-8") as f: - writer = csv.DictWriter(f, fieldnames=headers, extrasaction="ignore") - if not file_exists: - writer.writeheader() - if file_exists and existing_header: - csv.writer(f).writerow(row_to_write) - else: - writer.writerow(row_to_write) - def cleanup(self, deployment_id: str) -> bool: """Delete Job, ConfigMap, Service and associated pods.""" success = True diff --git a/src/madengine/deployment/presets/slurm/defaults.json b/src/madengine/deployment/presets/slurm/defaults.json index cf4ba978..aa98b06f 100644 
--- a/src/madengine/deployment/presets/slurm/defaults.json +++ b/src/madengine/deployment/presets/slurm/defaults.json @@ -13,7 +13,7 @@ "nodes": 1, "gpus_per_node": 8, "time": "24:00:00", - "output_dir": "./slurm_output", + "output_dir": "./slurm_results", "exclusive": true, "modules": [] }, diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 8a4a1f7b..7849e911 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -14,193 +14,16 @@ import os import subprocess from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List, Optional -from jinja2 import Environment, FileSystemLoader - -from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus -from .config_loader import ConfigLoader +from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env +from .common import configure_multi_node_profiling, normalize_launcher +from .config_loader import ConfigLoader, apply_deployment_config from .slurm_node_selector import SlurmNodeSelector from madengine.utils.gpu_config import resolve_runtime_gpus -from typing import Optional - - -# Valid distributed launchers -VALID_LAUNCHERS = [ - "torchrun", - "torchtitan", - "deepspeed", - "megatron-lm", - "vllm", - "sglang", - "sglang-disagg" -] - - -def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> str: - """ - Normalize launcher field based on deployment type and launcher value. 
- - Logic: - - If launcher is in VALID_LAUNCHERS: keep as-is - - If launcher is None/empty/invalid: - * local → "docker" (runs in Docker container) - * slurm → "docker" (typically uses containers on compute nodes) - * kubernetes → "native" (pod itself is the container) - - Args: - launcher_type: Raw launcher type from config (may be None) - deployment_type: "local", "slurm", or "kubernetes" - - Returns: - Normalized launcher string - """ - # If launcher is valid, keep it - if launcher_type and launcher_type in VALID_LAUNCHERS: - return launcher_type - - # Otherwise, default based on deployment type - if deployment_type == "local": - return "docker" - elif deployment_type == "slurm": - return "docker" - elif deployment_type == "kubernetes": - return "native" - else: - # Fallback for unknown deployment types - return "docker" - - -def is_rocprofv3_available() -> bool: - """ - Check if rocprofv3 is available on the system. - - rocprofv3 is required for multi-node profiling with MPI support. - It's part of rocprofiler-sdk package in ROCm >= 6.4.1. - - Returns: - True if rocprofv3 is available and executable, False otherwise - """ - try: - # Note: rocprofv3 doesn't support --version, use --help instead - result = subprocess.run( - ["rocprofv3", "--help"], - capture_output=True, - text=True, - timeout=5 - ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired, OSError): - return False - - -def configure_multi_node_profiling( - nnodes: int, - tools_config: list, - logger -) -> Dict[str, Any]: - """ - Configure profiling for multi-node SLURM runs with rocprofv3 support. - - Industry best practice for multi-node profiling: - - Profile ALL nodes to detect stragglers, load imbalances, and communication bottlenecks - - Use rocprofv3 (MPI-aware) for distributed profiling - - Collect per-node outputs for detailed analysis - - Logic: - 1. Single node (nnodes == 1): Use existing tool behavior - 2. Multi-node (nnodes > 1): - a. 
Check if rocprofv3 is available - b. If available: Enable per-node profiling, upgrade "rocprof" to "rocprofv3" - c. If not available: Log warning and skip profiling - - Args: - nnodes: Number of nodes in the SLURM deployment - tools_config: List of tool configurations from user - logger: Logger instance for messages - - Returns: - Dictionary with profiling configuration: - - enabled: bool - Whether profiling is enabled - - mode: str - "single_node", "multi_node", or "multi_node_unsupported" - - tools: list - Processed tool configurations - - per_node_collection: bool - Whether to collect from all nodes - """ - if nnodes == 1: - # Single node - existing behavior works fine - return { - "enabled": True, - "mode": "single_node", - "tools": tools_config, - "per_node_collection": False - } - - # Multi-node case - check rocprofv3 availability - if not is_rocprofv3_available(): - logger.warning( - "╔════════════════════════════════════════════════════════════════════════════╗\n" - "║ Multi-Node Profiling Requirements Not Met ║\n" - "╠════════════════════════════════════════════════════════════════════════════╣\n" - "║ Multi-node profiling requires rocprofv3 (MPI-aware profiling support). ║\n" - "║ ║\n" - "║ Current Status: rocprofv3 NOT FOUND on system ║\n" - "║ ║\n" - "║ Profiling will be SKIPPED for this multi-node run. 
║\n" - "║ ║\n" - "║ To enable multi-node profiling: ║\n" - "║ • Install rocprofiler-sdk package (ROCm >= 6.4.1) ║\n" - "║ • Command: apt install rocprofiler-sdk ║\n" - "║ • Or upgrade to ROCm 6.4.1 or later ║\n" - "║ ║\n" - "║ Note: Single-node profiling uses rocprof (no rocprofv3 required) ║\n" - "╚════════════════════════════════════════════════════════════════════════════╝" - ) - return { - "enabled": False, - "mode": "multi_node_unsupported", - "tools": [], - "per_node_collection": False - } - - # rocprofv3 is available - enable full multi-node profiling - logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") - - # Upgrade "rocprof" tools to "rocprofv3" for multi-node compatibility - upgraded_tools = [] - rocprof_upgraded = False - - for tool in tools_config: - tool_name = tool.get("name") if isinstance(tool, dict) else None - - if tool_name == "rocprof": - # Upgrade to rocprofv3 for multi-node MPI support - logger.info( - f" → Upgrading 'rocprof' to 'rocprofv3' for multi-node MPI compatibility" - ) - upgraded_tool = tool.copy() if isinstance(tool, dict) else {"name": "rocprofv3"} - upgraded_tool["name"] = "rocprofv3" - upgraded_tools.append(upgraded_tool) - rocprof_upgraded = True - else: - upgraded_tools.append(tool) - - # Log profiling tools being used - if upgraded_tools: - tool_names = [t.get("name") if isinstance(t, dict) else str(t) for t in upgraded_tools] - logger.info(f" → Multi-node profiling tools: {', '.join(filter(None, tool_names))}") - - # Highlight RCCL trace if present (critical for multi-node communication) - if "rccl_trace" in tool_names: - logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") - - return { - "enabled": True, - "mode": "multi_node", - "tools": upgraded_tools, - "per_node_collection": True, - "profiler": "rocprofv3", - "wrapper_mode": "launcher" - } +from madengine.utils.run_details import get_build_number, get_pipeline +from madengine.utils.path_utils import 
scripts_base_dir_from +import json class SlurmDeployment(BaseDeployment): @@ -232,11 +55,7 @@ def __init__(self, config: DeploymentConfig): Args: config: Deployment configuration """ - # Apply intelligent defaults using ConfigLoader - # This merges built-in presets with user configuration - full_config = ConfigLoader.load_slurm_config(config.additional_context) - config.additional_context = full_config - + apply_deployment_config(config, ConfigLoader.load_slurm_config) super().__init__(config) # Parse SLURM configuration (now with defaults applied) @@ -248,15 +67,11 @@ def __init__(self, config: DeploymentConfig): self.nodes = self.slurm_config.get("nodes", 1) self.gpus_per_node = self.slurm_config.get("gpus_per_node", 8) self.time_limit = self.slurm_config.get("time", "24:00:00") - self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_output")) + self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_results")) # Setup Jinja2 template engine template_dir = Path(__file__).parent / "templates" / "slurm" - self.jinja_env = Environment(loader=FileSystemLoader(str(template_dir))) - - # Register custom Jinja2 filters - self.jinja_env.filters['dirname'] = lambda path: str(Path(path).parent) - self.jinja_env.filters['basename'] = lambda path: str(Path(path).name) + self.jinja_env = create_jinja_env(template_dir) # Generated script path self.script_path = None @@ -487,6 +302,7 @@ def debug(self, msg): "timeout": self.config.timeout, "live_output": self.config.additional_context.get("live_output", False), "tags": " ".join(model_info.get("tags", [])), + "multiple_results": model_info.get("multiple_results"), "credential_file": "credential.json" if Path("credential.json").exists() else None, @@ -852,42 +668,69 @@ def deploy(self) -> DeploymentResult: ) # ==================== PREFLIGHT NODE SELECTION ==================== - # For multi-node jobs with Ray/vLLM, check for clean nodes first - # to avoid OOM errors from stale processes + # For single- and 
multi-node jobs, check for clean nodes and exclude bad ones. + # Single-node: we still run the check so bad nodes (e.g. Docker broken) get excluded; + # we never gate submission for nodes==1 so behavior stays backward compatible. + # Health-check srun invocations create SLURM jobs; we cancel them after preflight. enable_preflight = self.slurm_config.get("enable_node_check", True) auto_cleanup = self.slurm_config.get("auto_cleanup_nodes", False) - - if enable_preflight and self.nodes > 1 and not self.slurm_config.get("nodelist"): + allow_submit_without_clean = self.slurm_config.get("allow_submit_without_clean_nodes", False) + clean_nodes: List[str] = [] + health_check_job_name: Optional[str] = None + + if enable_preflight and self.nodes >= 1 and not self.slurm_config.get("nodelist"): try: selector = SlurmNodeSelector( console=self.console, auto_cleanup=auto_cleanup, verbose=self.slurm_config.get("verbose_node_check", False), ) - - # Select clean nodes and get updated exclude list clean_nodes, updated_exclude = selector.select_nodes( partition=self.partition, nodes_needed=self.nodes, exclude=self.slurm_config.get("exclude"), constraint=self.slurm_config.get("constraint"), ) - - # Update exclude list if dirty nodes found + health_check_job_name = getattr(selector, "_health_check_job_name", None) + + # Update exclude list if we found dirty/unreachable/unknown nodes if updated_exclude and updated_exclude != self.slurm_config.get("exclude", ""): self.console.print( f"[dim]Updated exclude list for sbatch: {updated_exclude}[/dim]\n" ) - # Re-generate script with updated exclude list self.slurm_config["exclude"] = updated_exclude - self.prepare() # Re-generate sbatch script - + self.prepare() + + # Gate: do not submit if not enough clean nodes (multi-node only; single-node always allowed) + if ( + self.nodes > 1 + and not allow_submit_without_clean + and len(clean_nodes) < self.nodes + ): + SlurmNodeSelector.cancel_health_check_jobs(health_check_job_name, self.console) + 
return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=( + f"Not enough clean nodes: need {self.nodes}, found {len(clean_nodes)}. " + "Set slurm.allow_submit_without_clean_nodes=true to submit anyway." + ), + ) + + # When we have enough clean nodes, pin the job to them via nodelist + if len(clean_nodes) >= self.nodes: + nodelist_str = ",".join(clean_nodes[: self.nodes]) + self.slurm_config["nodelist"] = nodelist_str + self.console.print(f"[dim]Using nodelist: {nodelist_str}[/dim]\n") + self.prepare() except Exception as e: - # Don't fail deployment if preflight fails self.console.print( f"[yellow]⚠ Node health check failed: {e}[/yellow]" ) self.console.print("[dim]Continuing with job submission[/dim]\n") + finally: + # Always cancel health-check jobs so they do not stay in the queue + SlurmNodeSelector.cancel_health_check_jobs(health_check_job_name, self.console) # ==================== END PREFLIGHT ==================== try: @@ -1004,7 +847,7 @@ def _stream_job_output(self, job_id: str, final: bool = False): self._output_positions = {} # Find output file - output_dir = self.slurm_config.get("output_dir", "./slurm_output") + output_dir = str(self.output_dir) output_pattern = f"{output_dir}/madengine-*_{job_id}_*.out" try: @@ -1045,7 +888,7 @@ def _stream_job_output(self, job_id: str, final: bool = False): def _show_log_summary(self, job_id: str, success: bool = True): """Show a summary with pointers to log files instead of streaming verbose output.""" - output_dir = self.slurm_config.get("output_dir", "./slurm_output") + output_dir = str(self.output_dir) try: import glob @@ -1143,23 +986,128 @@ def _check_job_completion(self, job_id: str) -> DeploymentResult: message=f"Job {job_id} completed (status unavailable)", ) + def _build_perf_entry_from_aggregated( + self, + aggregated_record: Dict[str, Any], + model_info: Dict[str, Any], + build_info: Dict[str, Any], + deployment_id: str, + ) -> Dict[str, Any]: + """ + Build a full 
run-details dict (same shape as container_runner create_run_details_dict) + from an aggregated multi-node record or single-node parsed record, for use with + update_perf_csv and update_perf_super_*. + """ + from madengine.reporting.update_perf_csv import flatten_tags + from madengine.utils.config_parser import ConfigParser + + launcher_type = self.distributed_config.get("launcher", "torchrun") + launcher = normalize_launcher(launcher_type, "slurm") + + run_details = { + "model": model_info.get("name", aggregated_record.get("model", "")), + "n_gpus": str(aggregated_record.get("n_gpus", self.nodes * self.gpus_per_node)), + "nnodes": str(aggregated_record.get("nnodes", self.nodes)), + "gpus_per_node": str(aggregated_record.get("gpus_per_node", self.gpus_per_node)), + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": self.DEPLOYMENT_TYPE, + "launcher": launcher, + "gpu_architecture": aggregated_record.get("gpu_architecture", ""), + "performance": str(aggregated_record.get("performance", "")), + "metric": aggregated_record.get("metric", ""), + "relative_change": "", + "status": aggregated_record.get("status", "SUCCESS"), + "build_duration": build_info.get("build_duration", ""), + "test_duration": aggregated_record.get("test_duration", ""), + "dataname": aggregated_record.get("data_name", ""), + "data_provider_type": aggregated_record.get("data_provider", ""), + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags(run_details) + + # 
Configs for perf_super (optional) + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + run_details["configs"] = config_parser.parse_and_load( + model_info.get("args", ""), scripts_path + ) + except Exception: + run_details["configs"] = None + + return run_details + + def _build_common_info_dict( + self, + model_info: Dict[str, Any], + build_info: Dict[str, Any], + deployment_id: str, + gpu_architecture: str = "", + ) -> Dict[str, Any]: + """Build common_info dict for update_perf_csv/update_perf_super (multiple_results path).""" + from madengine.reporting.update_perf_csv import flatten_tags + + launcher_type = self.distributed_config.get("launcher", "torchrun") + launcher = normalize_launcher(launcher_type, "slurm") + total_gpus = self.nodes * self.gpus_per_node + result = { + "n_gpus": str(total_gpus), + "nnodes": str(self.nodes), + "gpus_per_node": str(self.gpus_per_node), + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": self.DEPLOYMENT_TYPE, + "launcher": launcher, + "gpu_architecture": gpu_architecture, + "relative_change": "", + "build_duration": build_info.get("build_duration", ""), + "test_duration": "", + "dataname": model_info.get("data", ""), + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags(result) + return result + def collect_results(self, deployment_id: str) -> 
Dict[str, Any]: """Collect performance results from SLURM output files. - - NOTE: Current implementation works with single-node jobs where perf.csv - is written to shared storage. For multi-node jobs with per-node metrics, - this would need enhancement to: - 1. Read all node output files (madengine-*_jobid_noderank.out) - 2. Parse per-node metrics from each file - 3. Aggregate using _aggregate_node_metrics() (similar to kubernetes.py) - 4. Write aggregated result to perf.csv - - Args: - deployment_id: SLURM job ID + + Option (1): slurm_results holds only collected inputs; login node builds one run + record and runs the same reporting pipeline as local (perf_entry -> update_perf_csv + / update_perf_super_*) so project root has a single cumulative perf for both local + and distributed runs. """ - # Get session_start_row from config (passed from orchestrator) + from madengine.reporting.update_perf_csv import update_perf_csv + from madengine.reporting.update_perf_super import ( + update_perf_super_csv, + update_perf_super_json, + ) + session_start_row = self.config.additional_context.get("session_start_row") - results = { "job_id": deployment_id, "nodes": self.nodes, @@ -1168,86 +1116,368 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: "logs": [], "successful_runs": [], "failed_runs": [], - "session_start_row": session_start_row, # Track for downstream filtering + "session_start_row": session_start_row, } - try: - # Find output files - output_pattern = f"madengine-*_{deployment_id}_*.out" - output_files = list(self.output_dir.glob(output_pattern)) - - results["logs"] = [str(f) for f in output_files] - - # Find performance CSV files - # Strategy 1: Check results_dir if configured - if self.slurm_config.get("results_dir"): - results_dir = Path(self.slurm_config["results_dir"]) - perf_pattern = f"perf_{deployment_id}_*.csv" - perf_files = list(results_dir.glob(perf_pattern)) - results["perf_files"] = [str(f) for f in perf_files] - - # Strategy 2: Check 
shared workspace (NFS) for perf.csv - # When using shared storage, perf.csv is written directly to workspace - if not results["perf_files"]: + model_keys = list(self.manifest.get("built_models") or {}) + model_key = model_keys[0] if model_keys else None + # Use logical model name for job_dir so it matches the task script (which uses model_info["name"]). + # built_models is keyed by image name; value has "name" = logical model name. + built_models_dict = self.manifest.get("built_models") or {} + model_info_for_path = built_models_dict.get(model_key, {}) if model_key else {} + model_name_for_path = model_info_for_path.get("name", model_key or "unknown") + model_name = model_key or "unknown" # image key for build_info / model_info_for_entry lookups + + build_info = {} + built_images = self.manifest.get("built_images") or {} + if built_images: + # First image or one keyed by model key (image name) + if model_name in built_images: + build_info = built_images[model_name] + else: + build_info = next(iter(built_images.values()), {}) + + job_dir = self.output_dir / model_name_for_path / deployment_id + job_dir.mkdir(parents=True, exist_ok=True) + + # Gather log content per node: from job_dir/node_N/ (new) or flat output_dir .out files + per_node_log_contents: List[tuple] = [] + flat_out_files = sorted(self.output_dir.glob(f"madengine-*_{deployment_id}_*.out")) + # Multi-node: only use explicit node logs (_node_N.out) to avoid also picking up + # SBATCH %t output (madengine-*__0.out, _1.out), which would duplicate metrics. 
+ if self.nodes > 1: + flat_out_files = [f for f in flat_out_files if "_node_" in f.name] + results["logs"] = [str(f) for f in flat_out_files] + + for i, out_path in enumerate(flat_out_files): + content = out_path.read_text(encoding="utf-8", errors="ignore") + per_node_log_contents.append((i, content)) + + # If we have node subdirs (multi-node job script wrote them), prefer stdout.out there + for node_dir in sorted(job_dir.glob("node_*")): + stdout_path = node_dir / "stdout.out" + if stdout_path.exists(): + try: + idx = int(node_dir.name.replace("node_", "")) + if idx >= self.nodes: + continue # ignore stale node_N dirs from previous runs + content = stdout_path.read_text(encoding="utf-8", errors="ignore") + # Replace or append for this index + per_node_log_contents = [ + (n, c) for n, c in per_node_log_contents if n != idx + ] + per_node_log_contents.append((idx, content)) + per_node_log_contents.sort(key=lambda x: x[0]) + except (ValueError, OSError): + pass + + # Multi-node: keep only log entries for actual node indices [0, nodes-1] + if self.nodes > 1: + per_node_log_contents = [(n, c) for n, c in per_node_log_contents if n < self.nodes] + + # Copy flat logs into job_dir/node_/ for consistency if not already there. + # Only create dirs for indices in [0, nodes-1] so we never create extra node_2, etc. 
+ for idx, content in per_node_log_contents: + if idx >= self.nodes: + continue + node_subdir = job_dir / f"node_{idx}" + node_subdir.mkdir(parents=True, exist_ok=True) + if not (node_subdir / "stdout.out").exists(): + (node_subdir / "stdout.out").write_text(content) + + # Parse performance from each node's log + all_parsed: List[Dict[str, Any]] = [] + for idx, content in sorted(per_node_log_contents, key=lambda x: x[0]): + perf_data = self._parse_performance_from_log(content, model_name) + if perf_data: + all_parsed.append(perf_data) + + # Deduplicate by node_id so each node is only counted once (avoids double-counting + # when multiple log sources exist for the same node, e.g. SBATCH vs node_*.out). + per_node_metrics: List[Dict[str, Any]] = [] + if self.nodes > 1 and all_parsed: + seen_node_ids: set = set() + for m in all_parsed: + nid = m.get("node_id") + if nid is not None and nid in seen_node_ids: + continue + if nid is not None: + seen_node_ids.add(nid) + per_node_metrics.append(m) + self.console.print( + f"[dim] Parsed node: {m.get('performance')} " + f"{m.get('metric', '')} (node_id={m.get('node_id')})[/dim]" + ) + else: + per_node_metrics = all_parsed + for m in per_node_metrics: + self.console.print( + f"[dim] Parsed node: {m.get('performance')} " + f"{m.get('metric', '')} (node_id={m.get('node_id')})[/dim]" + ) + + run_details_dict: Optional[Dict[str, Any]] = None + model_info_for_entry = (self.manifest.get("built_models") or {}).get( + model_key, {} + ) if model_key else {} + + # Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory + mult_res = model_info_for_entry.get("multiple_results") + if mult_res: + resolved_csv: Optional[Path] = None + if (job_dir / mult_res).is_file(): + resolved_csv = job_dir / mult_res + else: + for i in range(self.nodes): + candidate = job_dir / f"node_{i}" / mult_res + if candidate.is_file(): + resolved_csv = candidate + break + if not resolved_csv and Path(mult_res).is_file(): + resolved_csv = 
Path(mult_res) + if not resolved_csv and Path("run_directory", mult_res).is_file(): + resolved_csv = Path("run_directory", mult_res) + if resolved_csv: + self._ensure_perf_csv_exists() + gpu_arch = "" + if per_node_metrics: + gpu_arch = per_node_metrics[0].get("gpu_architecture", "") or "" + common_info = self._build_common_info_dict( + model_info_for_entry, build_info, deployment_id, gpu_arch + ) + common_info_path = Path("common_info.json") + with open(common_info_path, "w", encoding="utf-8") as f: + json.dump(common_info, f, indent=2) + update_perf_csv( + perf_csv="perf.csv", + multiple_results=str(resolved_csv), + common_info=str(common_info_path), + model_name=model_info_for_entry.get("name", model_name), + ) + scripts_path = model_info_for_entry.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + num_entries = update_perf_super_json( + perf_super_json="perf_super.json", + multiple_results=str(resolved_csv), + common_info=str(common_info_path), + model_name=model_info_for_entry.get("name", model_name), + scripts_base_dir=scripts_base_dir, + ) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries, + ) + results["perf_files"] = [str(Path("perf.csv").resolve())] + import csv as _csv + try: + with open(resolved_csv, "r", encoding="utf-8", errors="ignore") as f: + reader = _csv.DictReader(f) + for row in reader: + row = {k.strip(): v for k, v in row.items() if k} + if row.get("performance") and row.get("metric"): + results["successful_runs"].append({ + "model": model_info_for_entry.get("name", "") + "_" + row.get("model", ""), + "status": "SUCCESS", + "performance": str(row.get("performance", "")), + "metric": row.get("metric", ""), + "duration": row.get("test_duration", ""), + "gpu_arch": gpu_arch, + "deployment": "slurm", + "machine": deployment_id, + }) + except Exception: + pass + self.console.print( + f"[green]✓ Updated perf.csv, perf_super.* from multiple_results 
(Docker-compatible)[/green]" + ) + return results + # multiple_results set but CSV not found: fall through to single-result path (may write FAILURE) + + if self.nodes > 1 and per_node_metrics: + launcher_type = self.distributed_config.get("launcher", "torchrun") + aggregated = self._aggregate_node_metrics( + per_node_metrics, self.nodes, launcher_type + ) + if aggregated and model_info_for_entry: + run_details_dict = self._build_perf_entry_from_aggregated( + aggregated, model_info_for_entry, build_info, deployment_id + ) + self.console.print( + f"[green]✓ Aggregated from {len(per_node_metrics)} nodes " + f"({aggregated.get('aggregation_method', '')}): " + f"{aggregated.get('performance')} {aggregated.get('metric', '')}[/green]" + ) + elif self.nodes == 1 and per_node_metrics and model_info_for_entry: + single = per_node_metrics[0] + single_record = { + "model": single.get("model", model_name), + "n_gpus": self.gpus_per_node, + "nnodes": 1, + "gpus_per_node": self.gpus_per_node, + "performance": single.get("performance"), + "metric": single.get("metric", ""), + "status": "SUCCESS", + "test_duration": single.get("test_duration", ""), + "gpu_architecture": single.get("gpu_architecture", ""), + "data_name": single.get("data_name", ""), + "data_provider": single.get("data_provider", ""), + } + run_details_dict = self._build_perf_entry_from_aggregated( + single_record, model_info_for_entry, build_info, deployment_id + ) + elif self.nodes == 1: + # Single-node but no parsed metric (log parse failed or run failed before metrics). + # In-job run skips perf write (skip_perf_collection); write a FAILURE row so run appears in perf. 
+ if model_info_for_entry: + single_record = { + "model": model_info_for_entry.get("name", model_name), + "n_gpus": self.gpus_per_node, + "nnodes": 1, + "gpus_per_node": self.gpus_per_node, + "performance": "", + "metric": "", + "status": "FAILURE", + "test_duration": "", + "gpu_architecture": "", + "data_name": "", + "data_provider": "", + } + run_details_dict = self._build_perf_entry_from_aggregated( + single_record, model_info_for_entry, build_info, deployment_id + ) + else: workspace_perf = Path("perf.csv") if workspace_perf.exists(): results["perf_files"] = [str(workspace_perf)] - self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]") - - # Parse perf.csv to populate successful_runs and failed_runs - # Filter based on session_start_row passed as parameter (no external files!) - if results["perf_files"]: - perf_file = Path(results["perf_files"][0]) - try: - import csv - - with open(perf_file, 'r') as f: - reader = csv.DictReader(f) - rows = list(reader) - - # Filter to only include rows from current session if session_start_row provided - if session_start_row is not None and session_start_row < len(rows): - rows = rows[session_start_row:] - self.console.print(f"[cyan]📊 Filtered to current session: {len(rows)} runs (from row {session_start_row} of {len(rows) + session_start_row} total)[/cyan]") - elif session_start_row is not None: - # Session start equals or exceeds current rows - no new runs yet - self.console.print(f"[yellow]⚠️ No new runs in this session (session started at row {session_start_row}, CSV has {len(rows)} rows)[/yellow]") - rows = [] - else: - # No session info provided - show all rows (for backward compatibility) - self.console.print(f"[dim]Showing all {len(rows)} runs from perf.csv (no session filtering)[/dim]") - - for row in rows: - run_data = { - "model": row.get("model", ""), - "status": row.get("status", ""), - "performance": row.get("performance", ""), - "metric": row.get("metric", ""), - "duration": 
row.get("test_duration", ""), - "gpu_arch": row.get("gpu_architecture", ""), - "deployment": row.get("deployment_type", ""), - "machine": row.get("machine_name", ""), - } - - if row.get("status") == "SUCCESS": - results["successful_runs"].append(run_data) - else: - results["failed_runs"].append(run_data) - except Exception as parse_error: - import traceback - self.console.print(f"[red]ERROR parsing perf.csv: {parse_error}[/red]") - self.console.print(f"[dim]{traceback.format_exc()}[/dim]") + self.console.print( + f"[green]✓ Collected results: {len(results['perf_files'])} perf files, " + f"{len(results['logs'])} log files[/green]" + ) + self._collect_results_parse_perf_csv(results, session_start_row) + return results + else: + # Multi-node but no metrics parsed - optional failure record + if per_node_metrics and model_info_for_entry: + launcher_type = self.distributed_config.get("launcher", "torchrun") + aggregated = self._aggregate_node_metrics( + per_node_metrics, self.nodes, launcher_type + ) + if aggregated: + aggregated["status"] = "FAILURE" + run_details_dict = self._build_perf_entry_from_aggregated( + aggregated, model_info_for_entry, build_info, deployment_id + ) - self.console.print( - f"[green]✓ Collected results: {len(results['perf_files'])} perf files, " - f"{len(results['logs'])} log files[/green]" + if run_details_dict is not None: + perf_entry_path = Path("perf_entry.json") + with open(perf_entry_path, "w") as f: + json.dump(run_details_dict, f, indent=2) + perf_csv_path = "perf.csv" + self._ensure_perf_csv_exists() + if run_details_dict.get("status") == "SUCCESS": + update_perf_csv(perf_csv=perf_csv_path, single_result=str(perf_entry_path)) + else: + update_perf_csv(perf_csv=perf_csv_path, exception_result=str(perf_entry_path)) + try: + scripts_path = model_info_for_entry.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + if run_details_dict.get("status") == "SUCCESS": + num_entries = update_perf_super_json( + 
single_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + else: + num_entries = update_perf_super_json( + exception_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries, + ) + except Exception as e: + self.console.print(f"[yellow]⚠ Could not update perf_super: {e}[/yellow]") + results["perf_files"] = [str(Path(perf_csv_path).resolve())] + run_data = { + "model": run_details_dict.get("model", ""), + "status": run_details_dict.get("status", ""), + "performance": str(run_details_dict.get("performance", "")), + "metric": run_details_dict.get("metric", ""), + "duration": run_details_dict.get("test_duration", ""), + "gpu_arch": run_details_dict.get("gpu_architecture", ""), + "deployment": run_details_dict.get("deployment_type", ""), + "machine": run_details_dict.get("machine_name", ""), + } + if run_details_dict.get("status") == "SUCCESS": + results["successful_runs"].append(run_data) + else: + results["failed_runs"].append(run_data) + summary = { + "job_id": deployment_id, + "model": model_name, + "nodes": self.nodes, + "per_node_metrics": per_node_metrics, + "final_metric": run_details_dict.get("metric", ""), + "final_performance": str(run_details_dict.get("performance", "")), + "perf_entry_path": str(perf_entry_path), + } + (job_dir / "results_summary.json").write_text( + json.dumps(summary, indent=2), encoding="utf-8" ) - except Exception as e: - self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") - + if not results["perf_files"]: + workspace_perf = Path("perf.csv") + if workspace_perf.exists(): + results["perf_files"] = [str(workspace_perf)] + # When we already appended the current run from run_details_dict, skip re-parsing + # the whole perf.csv so Execution Results shows only the current run. 
+ if run_details_dict is None: + self._collect_results_parse_perf_csv(results, session_start_row) + self.console.print( + f"[green]✓ Collected results: {len(results['perf_files'])} perf files, " + f"{len(results['logs'])} log files[/green]" + ) return results + def _collect_results_parse_perf_csv( + self, results: Dict[str, Any], session_start_row: Optional[int] + ) -> None: + """Parse perf.csv to populate results['successful_runs'] and results['failed_runs'].""" + if not results.get("perf_files"): + return + import csv + + perf_file = Path(results["perf_files"][0]) + try: + with open(perf_file, "r") as f: + reader = csv.DictReader(f) + rows = list(reader) + if session_start_row is not None and session_start_row < len(rows): + rows = rows[session_start_row:] + elif session_start_row is not None and session_start_row >= len(rows): + rows = [] + for row in rows: + run_data = { + "model": row.get("model", ""), + "status": row.get("status", ""), + "performance": row.get("performance", ""), + "metric": row.get("metric", ""), + "duration": row.get("test_duration", ""), + "gpu_arch": row.get("gpu_architecture", ""), + "deployment": row.get("deployment_type", ""), + "machine": row.get("machine_name", ""), + } + if row.get("status") == "SUCCESS": + results["successful_runs"].append(run_data) + else: + results["failed_runs"].append(run_data) + except Exception as e: + self.console.print(f"[yellow]⚠ Could not parse perf.csv: {e}[/yellow]") + def cleanup(self, deployment_id: str) -> bool: """Cancel SLURM job if still running (locally).""" try: diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py index b52f53d7..408e8d3c 100644 --- a/src/madengine/deployment/slurm_node_selector.py +++ b/src/madengine/deployment/slurm_node_selector.py @@ -10,6 +10,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
""" +import os import subprocess import time from dataclasses import dataclass @@ -86,6 +87,9 @@ def __init__( self.verbose = verbose self.timeout = timeout + # Max candidates to check (avoids excessive checks on large clusters) + MAX_CANDIDATES_CAP = 100 + def get_candidate_nodes( self, partition: str, @@ -94,16 +98,16 @@ def get_candidate_nodes( constraint: Optional[str] = None, ) -> Optional[List[str]]: """ - Query SLURM for candidate nodes in partition. - + Query SLURM for idle candidate nodes in partition. + Args: partition: SLURM partition name - count: Number of nodes needed + count: Number of nodes needed (used for optional cap) exclude: Comma-separated nodes to exclude constraint: SLURM constraint filter - + Returns: - List of candidate node names (2x count for redundancy) + List of idle node names (all idle, up to MAX_CANDIDATES_CAP) """ cmd = [ "sinfo", @@ -111,12 +115,12 @@ def get_candidate_nodes( "-N", # Node-oriented format "-h", # No header "-o", "%N", # Node name only - "-t", "idle,alloc,mix", # Available states + "-t", "idle", # Idle nodes only ] - + if constraint: cmd.extend(["-C", constraint]) - + try: result = subprocess.run( cmd, @@ -124,32 +128,30 @@ def get_candidate_nodes( text=True, timeout=10, ) - + if result.returncode != 0: if self.verbose: self.console.print( f"[yellow]⚠ sinfo failed: {result.stderr}[/yellow]" ) return None - + # Parse nodes all_nodes = set() for line in result.stdout.strip().split('\n'): line = line.strip() if line: - # Handle node ranges like "node[01-04]" all_nodes.add(line) - + # Remove excluded nodes if exclude: excluded = set(exclude.split(',')) all_nodes -= excluded - - # Return 2x count for redundancy (check more nodes than needed) - candidates = sorted(list(all_nodes))[:(count * 2)] - + + # Return all idle nodes, capped to avoid excessive checks + candidates = sorted(list(all_nodes))[: self.MAX_CANDIDATES_CAP] return candidates - + except subprocess.TimeoutExpired: self.console.print("[yellow]⚠ sinfo timed 
out[/yellow]") return None @@ -158,16 +160,17 @@ def get_candidate_nodes( self.console.print(f"[yellow]⚠ Query failed: {e}[/yellow]") return None - def check_node_health(self, node: str) -> NodeStatus: + def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeStatus: """ Check GPU health on a node using srun. - + Uses srun to execute GPU check on the node without SSH. Checks for stale Ray/vLLM processes and GPU memory usage. - + Args: node: Node name to check - + job_name: Optional SLURM job name for this srun (enables cleanup of health-check jobs) + Returns: NodeStatus with health information """ @@ -196,19 +199,21 @@ def check_node_health(self, node: str) -> NodeStatus: ps aux | grep -E "(ray::|RayWorkerWrapper|raylet|vllm)" | grep -v grep || echo "NO_PROCESSES" echo "===END_PROCESSES===" """ - + srun_cmd = [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", # Allow overlap with running jobs + "--quiet", + ] + if job_name: + srun_cmd.append(f"--job-name={job_name}") + srun_cmd.extend(["bash", "-c", check_script]) + try: - # Use srun to execute check on specific node result = subprocess.run( - [ - "srun", - f"--nodelist={node}", - "--ntasks=1", - "--time=00:01:00", - "--overlap", # Allow overlap with running jobs - "--quiet", - "bash", "-c", check_script - ], + srun_cmd, capture_output=True, text=True, timeout=self.timeout, @@ -279,13 +284,14 @@ def check_node_health(self, node: str) -> NodeStatus: error_message=str(e)[:100], ) - def cleanup_node(self, node: str) -> bool: + def cleanup_node(self, node: str, job_name: Optional[str] = None) -> bool: """ Clean up stale processes on a node using srun. 
- + Args: node: Node name to clean - + job_name: Optional SLURM job name for this srun (enables cleanup of health-check jobs) + Returns: True if cleanup successful """ @@ -307,18 +313,21 @@ def cleanup_node(self, node: str) -> bool: echo "CLEANUP_OK" """ - + srun_cmd = [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", + "--quiet", + ] + if job_name: + srun_cmd.append(f"--job-name={job_name}") + srun_cmd.extend(["bash", "-c", cleanup_script]) + try: result = subprocess.run( - [ - "srun", - f"--nodelist={node}", - "--ntasks=1", - "--time=00:01:00", - "--overlap", - "--quiet", - "bash", "-c", cleanup_script - ], + srun_cmd, capture_output=True, text=True, timeout=self.timeout, @@ -345,16 +354,16 @@ def select_nodes( ) -> Tuple[List[str], str]: """ Select clean nodes for SLURM job. - - This is the main entry point. Checks candidate nodes and returns - a list of clean nodes plus an updated exclude list. - + + Checks idle nodes on-demand and stops as soon as enough clean nodes + are found. Excludes dirty, unreachable, and unknown nodes from allocation. 
+ Args: partition: SLURM partition name nodes_needed: Number of nodes required for job exclude: Current exclude list (comma-separated) constraint: SLURM constraint filter - + Returns: Tuple of (clean_nodes, updated_exclude_list) - clean_nodes: List of clean node names (may be empty) @@ -365,28 +374,31 @@ def select_nodes( f"Partition: [cyan]{partition}[/cyan] | " f"Nodes needed: [cyan]{nodes_needed}[/cyan]\n" ) - - # Get candidate nodes + + # Unique job name for all health-check srun invocations (enables cleanup) + self._health_check_job_name = f"madengine_nodecheck_{os.getpid()}_{int(time.time())}" + + # Get all idle candidate nodes candidates = self.get_candidate_nodes(partition, nodes_needed, exclude, constraint) - + if not candidates: self.console.print( "[yellow]⚠ Cannot query candidate nodes, skipping preflight check[/yellow]\n" ) + self._health_check_job_name = None return [], exclude or "" - + if self.verbose: - self.console.print(f"[dim]Checking {len(candidates)} candidate nodes...[/dim]\n") - - # Check health of each candidate - statuses = [] + self.console.print(f"[dim]Idle candidates: {len(candidates)} (checking on-demand until {nodes_needed} clean)[/dim]\n") + + # On-demand check: stop as soon as we have enough clean nodes + statuses: List[NodeStatus] = [] + clean_nodes: List[str] = [] for node in candidates: if self.verbose: self.console.print(f" Checking {node}...", end="") - - status = self.check_node_health(node) + status = self.check_node_health(node, job_name=self._health_check_job_name) statuses.append(status) - if self.verbose: emoji = { NodeHealth.CLEAN: "✓", @@ -395,56 +407,61 @@ def select_nodes( NodeHealth.UNKNOWN: "?", }[status.health] self.console.print(f" {emoji}") - - # Display summary table + if status.health == NodeHealth.CLEAN: + clean_nodes.append(node) + if len(clean_nodes) >= nodes_needed: + break + + # Display summary table (only nodes we checked) self._display_status_table(statuses) - - # Identify dirty nodes + + # Nodes to 
exclude: DIRTY, UNREACHABLE, and UNKNOWN dirty_nodes = [s for s in statuses if s.health == NodeHealth.DIRTY] - clean_nodes = [s.node for s in statuses if s.health == NodeHealth.CLEAN] - - # Handle dirty nodes + unreachable_nodes = [s for s in statuses if s.health == NodeHealth.UNREACHABLE] + unknown_nodes = [s for s in statuses if s.health == NodeHealth.UNKNOWN] + nodes_to_exclude = set() + nodes_to_exclude.update(s.node for s in dirty_nodes) + nodes_to_exclude.update(s.node for s in unreachable_nodes) + nodes_to_exclude.update(s.node for s in unknown_nodes) + + # Handle dirty nodes (optional auto-cleanup) if dirty_nodes: self.console.print( f"\n[yellow]⚠ Found {len(dirty_nodes)} dirty node(s) " f"with stale Ray/vLLM processes[/yellow]" ) - if self.auto_cleanup: self.console.print("[yellow]Running automatic cleanup...[/yellow]\n") - for status in dirty_nodes: self.console.print(f" Cleaning {status.node}...") - if self.cleanup_node(status.node): - # Re-check after cleanup + if self.cleanup_node(status.node, job_name=self._health_check_job_name): time.sleep(2) - new_status = self.check_node_health(status.node) + new_status = self.check_node_health(status.node, job_name=self._health_check_job_name) if new_status.health == NodeHealth.CLEAN: clean_nodes.append(new_status.node) + nodes_to_exclude.discard(status.node) self.console.print(f" [green]✓ {status.node} is now clean[/green]") else: self.console.print(f" [red]✗ {status.node} still dirty[/red]") else: self.console.print(f" [red]✗ Cleanup failed[/red]") - - # Update dirty nodes list - dirty_nodes = [s for s in statuses - if s.health == NodeHealth.DIRTY and s.node not in clean_nodes] - - # Build updated exclude list - dirty_node_names = [s.node for s in dirty_nodes] - existing_exclude = set(exclude.split(',')) if exclude else set() - existing_exclude.update(dirty_node_names) - updated_exclude = ','.join(sorted(existing_exclude)) - - if dirty_node_names: - self.console.print( - f"\n[yellow]Adding dirty nodes to 
exclude list: " - f"{', '.join(dirty_node_names)}[/yellow]" - ) - else: - updated_exclude = exclude or "" - + + # Build updated exclude list (dirty + unreachable + unknown) + existing_exclude = set(exclude.split(',')) if exclude else set() + existing_exclude.update(nodes_to_exclude) + updated_exclude = ','.join(sorted(existing_exclude)) + + if unreachable_nodes or unknown_nodes: + bad = [s.node for s in unreachable_nodes] + [s.node for s in unknown_nodes] + self.console.print( + f"\n[yellow]Excluding unreachable/unknown nodes: {', '.join(bad)}[/yellow]" + ) + if dirty_nodes and not self.auto_cleanup: + self.console.print( + f"\n[yellow]Adding dirty nodes to exclude list: " + f"{', '.join(s.node for s in dirty_nodes)}[/yellow]" + ) + # Final summary if len(clean_nodes) >= nodes_needed: self.console.print( @@ -464,7 +481,7 @@ def select_nodes( self.console.print( "[yellow]Recommendation: Wait for nodes to be cleaned or run manual cleanup[/yellow]\n" ) - + return clean_nodes, updated_exclude def _extract_section(self, text: str, start_marker: str, end_marker: str) -> str: @@ -516,3 +533,41 @@ def _display_status_table(self, statuses: List[NodeStatus]): self.console.print(table) self.console.print() + @staticmethod + def cancel_health_check_jobs(job_name: Optional[str], console: Optional[Console] = None) -> None: + """ + Cancel any SLURM jobs created by the node health check (srun invocations). + + Call this after select_nodes() so pending health-check jobs do not stay in the queue. + + Args: + job_name: Job name used for health-check srun (e.g. 
selector._health_check_job_name) + console: Optional Rich console for messages + """ + if not job_name: + return + _console = console or Console() + try: + user = os.environ.get("USER", "") + if not user: + return + result = subprocess.run( + ["squeue", "-u", user, "-n", job_name, "-h", "-o", "%i"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0 or not result.stdout.strip(): + return + job_ids = result.stdout.strip().split() + for jid in job_ids: + if jid.isdigit(): + subprocess.run( + ["scancel", jid], + capture_output=True, + timeout=5, + ) + if job_ids and _console: + _console.print(f"[dim]Cancelled {len(job_ids)} health-check job(s)[/dim]") + except Exception: + pass diff --git a/src/madengine/deployment/templates/slurm/job.sh.j2 b/src/madengine/deployment/templates/slurm/job.sh.j2 index 06ee4c64..46a59b20 100644 --- a/src/madengine/deployment/templates/slurm/job.sh.j2 +++ b/src/madengine/deployment/templates/slurm/job.sh.j2 @@ -275,6 +275,11 @@ if 'deployment_config' in manifest: if gpus_per_node: manifest['deployment_config']['gpus_per_node'] = gpus_per_node +# SLURM (single- and multi-node): skip in-job perf write; collect_results will add one record +if manifest.get('context') is None: + manifest['context'] = {} +manifest['context']['skip_perf_collection'] = True + with open(output_file, 'w') as f: json.dump(manifest, f, indent=2) print('Created Docker execution manifest for SLURM compute node') @@ -352,6 +357,10 @@ fi # ============================================================================= # Multi-node: Execute with per-task setup # ============================================================================= +# PyTorch elastic rendezvous: allow time for all nodes to finish image pull/startup. +# Default ~900s is too short when each node does a fresh pull; 3600s avoids "Timed out waiting for clients". +export TORCH_ELASTIC_RDZV_TIMEOUT=3600 + # For multi-node distributed execution: # 1. 
Each srun task runs on a separate node with unique SLURM_PROCID # 2. All nodes participate in workload via launcher (torchrun/vLLM/SGLang/etc.) @@ -369,7 +378,7 @@ fi # Use submission directory (shared filesystem) for task script # /tmp is local to each node and won't be accessible by srun on other nodes -TASK_SCRIPT="{{ manifest_file | dirname }}/slurm_output/madengine_task_${SLURM_JOB_ID}.sh" +TASK_SCRIPT="{{ manifest_file | dirname }}/{{ output_dir }}/madengine_task_${SLURM_JOB_ID}.sh" cat > "$TASK_SCRIPT" << 'TASK_SCRIPT_EOF' #!/bin/bash @@ -453,13 +462,22 @@ rsync -a --quiet \ --exclude='.venv' \ --exclude='env' \ --exclude='.env' \ - --exclude='slurm_output/*.out' \ - --exclude='slurm_output/*.log' \ + --exclude='{{ output_dir }}/*.out' \ + --exclude='{{ output_dir }}/*.log' \ "$SUBMISSION_DIR/" "$WORKSPACE/" echo " ✓ Project copied to local workspace" echo "" +# ============================================================================= +# Debug: set node log paths early and trap EXIT so we always see which node failed +# ============================================================================= +RESULTS_DIR="$SUBMISSION_DIR" +NODE_LOG_OUT="${RESULTS_DIR}/{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.out" +NODE_LOG_ERR="${RESULTS_DIR}/{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.err" +echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)): task script started" >> "${NODE_LOG_ERR}" +trap 'ec=$?; echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)): task script exiting with code $ec" >> "${NODE_LOG_ERR}"' EXIT + # ============================================================================= # Verify madengine Availability # ============================================================================= @@ -536,6 +554,11 @@ if 'deployment_config' in manifest: if gpus_per_node: manifest['deployment_config']['gpus_per_node'] = gpus_per_node +# SLURM (single- and 
multi-node): skip in-job perf write; collect_results will add one record +if manifest.get('context') is None: + manifest['context'] = {} +manifest['context']['skip_perf_collection'] = True + with open(output_file, 'w') as f: json.dump(manifest, f, indent=2) print('✓ Created Docker execution manifest for SLURM compute node') @@ -604,8 +627,8 @@ echo "" # Create node-specific log files in results directory RESULTS_DIR={{ manifest_file | dirname }} -NODE_LOG_OUT="${RESULTS_DIR}/slurm_output/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.out" -NODE_LOG_ERR="${RESULTS_DIR}/slurm_output/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.err" +NODE_LOG_OUT="${RESULTS_DIR}/{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.out" +NODE_LOG_ERR="${RESULTS_DIR}/{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.err" echo "Node ${SLURM_PROCID} logs:" echo " stdout: ${NODE_LOG_OUT}" @@ -619,85 +642,65 @@ else export MAD_SKIP_PERF_COLLECTION="false" fi +# Debug: log immediately before madengine so we know which node reached this point +echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)): about to run madengine" >> "${NODE_LOG_ERR}" + # Run madengine with output redirected to node-specific log files +# Use append (2>>) for stderr so debug lines written above are not overwritten # Environment variables (MASTER_ADDR, MAD_MULTI_NODE_RUNNER, etc.) are inherited $MAD_CLI_COMMAND run \ --manifest-file "$EXEC_MANIFEST" \ --timeout {{ timeout | default(3600) }} \ {% if shared_data %}--force-mirror-local {{ shared_data }}{% endif %} \ {% if live_output %}--live-output{% endif %} \ - > "${NODE_LOG_OUT}" 2> "${NODE_LOG_ERR}" + > "${NODE_LOG_OUT}" 2>> "${NODE_LOG_ERR}" TASK_EXIT=$? 
+echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)): madengine exited with code $TASK_EXIT" >> "${NODE_LOG_ERR}" echo "" echo "Task completed with exit code: $TASK_EXIT" # ============================================================================= -# Multi-Node Result Collection (Best Practice: Master Node Only) +# Multi-Node Per-Node Artifact Collection (Option 1: login node builds perf record) # ============================================================================= -# For distributed workloads, only the master node (SLURM_PROCID=0) should -# collect and report performance metrics to avoid: -# - Duplicate data in perf.csv -# - Race conditions in metric extraction -# - Failures from non-master nodes trying to report identical global metrics -# -# This follows distributed execution best practices where only rank 0 -# reports final metrics. +# Every node copies its workspace artifacts to submission dir for collect_results. +# Do NOT copy perf.csv/perf_entry.json/perf_super.* from nodes; login node will +# parse logs, aggregate, and run update_perf_csv/update_perf_super at project root. 
# ============================================================================= +SUBMISSION_DIR={{ manifest_file | dirname }} +JOB_COLLECTION_DIR="${SUBMISSION_DIR}/{{ output_dir }}/{{ model_name }}/${SLURM_JOB_ID}" +NODE_COLLECTION_DIR="${JOB_COLLECTION_DIR}/node_${SLURM_PROCID}" +mkdir -p "$NODE_COLLECTION_DIR" + if [ $TASK_EXIT -eq 0 ]; then - if [ "${SLURM_PROCID}" = "0" ]; then - # Master node: Collect and report results - RESULTS_DIR={{ manifest_file | dirname }} - echo "" - echo "========================================================================" - echo "Master Node (SLURM_PROCID=0): Collecting results" - echo "========================================================================" - echo "Copying results back to: $RESULTS_DIR" - - # Copy performance results (main metric file) - if [ -f "$WORKSPACE/perf.csv" ]; then - cp "$WORKSPACE/perf.csv" "$RESULTS_DIR/perf.csv" 2>/dev/null || true - echo " ✓ Copied: perf.csv (global metrics)" - fi - - # Copy log files - for log in "$WORKSPACE"/*.log; do - if [ -f "$log" ]; then - log_basename=$(basename "$log") - cp "$log" "$RESULTS_DIR/${log_basename}" 2>/dev/null || true - echo " ✓ Copied: ${log_basename}" - fi - done - - # Copy any workload results files - if [ -f "$WORKSPACE/results.txt" ]; then - cp "$WORKSPACE/results.txt" "$RESULTS_DIR/" 2>/dev/null || true - echo " ✓ Copied: results.txt" - fi - # Legacy support for training_results.txt - if [ -f "$WORKSPACE/training_results.txt" ]; then - cp "$WORKSPACE/training_results.txt" "$RESULTS_DIR/" 2>/dev/null || true - echo " ✓ Copied: training_results.txt" - fi - - echo " ✓ Master node results collection complete" - echo "========================================================================" - else - # Worker nodes: Exit cleanly without collecting results - echo "" - echo "========================================================================" - echo "Worker Node (SLURM_PROCID=${SLURM_PROCID}): Exiting cleanly" - echo 
"========================================================================" - echo " Note: Performance metrics collected by master node only (best practice)" - echo "========================================================================" - fi + echo "" + echo "========================================================================" + echo "Node ${SLURM_PROCID}: Copying artifacts to ${NODE_COLLECTION_DIR}" + echo "========================================================================" + # Copy node stdout/stderr into node dir for collect_results + if [ -f "${NODE_LOG_OUT}" ]; then cp "${NODE_LOG_OUT}" "$NODE_COLLECTION_DIR/stdout.out" 2>/dev/null || true; fi + if [ -f "${NODE_LOG_ERR}" ]; then cp "${NODE_LOG_ERR}" "$NODE_COLLECTION_DIR/stderr.err" 2>/dev/null || true; fi + # Copy env, logs, profiling, multi_results, etc. (not perf.csv/perf_entry/perf_super) + for f in "$WORKSPACE"/*_env.csv "$WORKSPACE"/*.log "$WORKSPACE"/multi_results*.csv "$WORKSPACE"/results* "$WORKSPACE"/gpu_info_*.csv "$WORKSPACE"/library_trace.csv; do + if [ -f "$f" ]; then cp "$f" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi + done + {% if multiple_results %} + if [ -f "$WORKSPACE/run_directory/{{ multiple_results }}" ]; then cp "$WORKSPACE/run_directory/{{ multiple_results }}" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi + if [ -f "$WORKSPACE/{{ multiple_results }}" ]; then cp "$WORKSPACE/{{ multiple_results }}" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi + {% endif %} + if [ -d "$WORKSPACE/rocprof_output" ]; then cp -r "$WORKSPACE/rocprof_output" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi + echo " ✓ Node ${SLURM_PROCID} artifacts copied" + echo "========================================================================" else echo "" echo "========================================================================" echo "Task FAILED with exit code: $TASK_EXIT" echo " Node: SLURM_PROCID=${SLURM_PROCID}" echo 
"========================================================================" + # Copy node logs so collect_results can still inspect them + if [ -f "${NODE_LOG_OUT}" ]; then cp "${NODE_LOG_OUT}" "$NODE_COLLECTION_DIR/stdout.out" 2>/dev/null || true; fi + if [ -f "${NODE_LOG_ERR}" ]; then cp "${NODE_LOG_ERR}" "$NODE_COLLECTION_DIR/stderr.err" 2>/dev/null || true; fi fi exit $TASK_EXIT @@ -741,17 +744,27 @@ $MAD_CLI_COMMAND run \ {% if live_output %}--live-output{% endif %} EXIT_CODE=$? + +# Single-node: copy multiple_results CSV to job collection dir so collect_results finds it +{% if multiple_results %} +if [ $EXIT_CODE -eq 0 ]; then + SUBMISSION_DIR={{ manifest_file | dirname }} + JOB_COLLECTION_DIR="${SUBMISSION_DIR}/{{ output_dir }}/{{ model_name }}/${SLURM_JOB_ID}" + NODE_COLLECTION_DIR="${JOB_COLLECTION_DIR}/node_0" + mkdir -p "$NODE_COLLECTION_DIR" + if [ -f "$WORKSPACE/run_directory/{{ multiple_results }}" ]; then cp "$WORKSPACE/run_directory/{{ multiple_results }}" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi + if [ -f "$WORKSPACE/{{ multiple_results }}" ]; then cp "$WORKSPACE/{{ multiple_results }}" "$NODE_COLLECTION_DIR/" 2>/dev/null || true; fi +fi +{% endif %} {% endif %} # ============================================================================= # Job Completion # ============================================================================= -# Note: For multi-node jobs, only the master node (SLURM_PROCID=0) collects -# and reports performance metrics. This follows distributed execution best -# practices where: -# - Global metrics are identical across all nodes (computed via all_reduce) -# - Only rank 0 should report to avoid duplicate/conflicting data -# - Worker nodes exit cleanly after workload completes +# Note: For multi-node jobs, each node copies its artifacts to slurm_results/ +# //node_/. 
The login node runs collect_results to parse +# logs, aggregate metrics, and update project-root perf.csv/perf_super via +# the same reporting pipeline as local runs. echo "" if [ $EXIT_CODE -eq 0 ]; then diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index c3299049..0bc53335 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -27,6 +27,24 @@ from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv from madengine.utils.gpu_config import resolve_runtime_gpus from madengine.utils.config_parser import ConfigParser +from madengine.utils.path_utils import scripts_base_dir_from +from madengine.utils.run_details import get_build_number, get_pipeline +from madengine.execution.container_runner_helpers import ( + make_run_log_file_path, + resolve_run_timeout, +) + + +def _resolve_multiple_results_path(multiple_results: str, model_dir: str) -> typing.Optional[str]: + """Resolve multiple_results CSV path: try cwd then model_dir. 
Return first that exists.""" + if not multiple_results: + return None + if os.path.isfile(multiple_results): + return multiple_results + path_in_model_dir = os.path.join(model_dir, multiple_results) + if os.path.isfile(path_in_model_dir): + return path_in_model_dir + return None class ContainerRunner: @@ -202,7 +220,7 @@ def create_run_details_dict( "nnodes": nnodes, "gpus_per_node": gpus_per_node, "training_precision": model_info.get("training_precision", ""), - "pipeline": os.environ.get("pipeline", ""), + "pipeline": get_pipeline(), "args": model_info.get("args", ""), "tags": model_info.get("tags", ""), "docker_file": build_info.get("dockerfile", ""), @@ -228,7 +246,7 @@ def create_run_details_dict( "data_provider_type": run_results.get("data_provider_type", ""), "data_size": run_results.get("data_size", ""), "data_download_duration": run_results.get("data_download_duration", ""), - "build_number": os.environ.get("BUILD_NUMBER", "0"), + "build_number": get_build_number(), "additional_docker_run_options": model_info.get( "additional_docker_run_options", "" ), @@ -240,7 +258,7 @@ def create_run_details_dict( # Parse and load config file if present in args for perf_entry_super.json try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + scripts_base_dir = scripts_base_dir_from(scripts_path) config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) run_details["configs"] = config_parser.parse_and_load( model_info.get("args", ""), @@ -654,34 +672,8 @@ def run_container( """ self.rich_console.print(f"[bold green]🏃 Running model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan] [dim]in container[/dim] [yellow]{docker_image}[/yellow]") - # Apply timeout logic: model timeout can override default timeout - # If model has a timeout in models.json and CLI timeout is default (7200), use model's timeout - # If CLI timeout is explicitly set (not default), it overrides model timeout - if 
"timeout" in model_info and model_info["timeout"] is not None and model_info["timeout"] > 0 and timeout == 7200: - # Model has a timeout and CLI is using default, so use model's timeout - timeout = model_info["timeout"] - - # Create log file for this run - # Extract dockerfile part from docker image name (remove "ci-" prefix and model name prefix) - image_name_without_ci = docker_image.replace("ci-", "") - model_name_clean = model_info["name"].replace("/", "_").lower() - - # Remove model name from the beginning to get the dockerfile part - if image_name_without_ci.startswith(model_name_clean + "_"): - dockerfile_part = image_name_without_ci[len(model_name_clean + "_") :] - else: - dockerfile_part = image_name_without_ci - - log_file_path = ( - model_info["name"].replace("/", "_") - + "_" - + dockerfile_part - + phase_suffix - + ".live.log" - ) - # Replace / with _ in log file path (already done above, but keeping for safety) - log_file_path = log_file_path.replace("/", "_") - + timeout = resolve_run_timeout(model_info, timeout) + log_file_path = make_run_log_file_path(model_info, docker_image, phase_suffix) print(f"Run log will be written to: {log_file_path}") # get machine name @@ -746,7 +738,7 @@ def run_container( # Add environment variables docker_options += f"--env MAD_MODEL_NAME='{model_info['name']}' " docker_options += ( - f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " + f"--env JENKINS_BUILD_NUMBER='{get_build_number()}' " ) # Gather data and environment @@ -782,6 +774,7 @@ def run_container( 'NNODES', 'NPROC_PER_NODE', 'MAD_MULTI_NODE_RUNNER', 'MAD_COLLECT_METRICS', 'NCCL_SOCKET_IFNAME', 'GLOO_SOCKET_IFNAME', 'NCCL_DEBUG', 'NCCL_IB_DISABLE', 'NCCL_NET_GDR_LEVEL', + 'TORCH_ELASTIC_RDZV_TIMEOUT', # Rendezvous timeout so all nodes can join after pull # GPU visibility variables for Ray-based launchers (vLLM, SGLang) # CRITICAL: These must be passed to Docker for proper GPU device mapping 'HIP_VISIBLE_DEVICES', 'ROCR_VISIBLE_DEVICES', 
'CUDA_VISIBLE_DEVICES' @@ -1071,11 +1064,14 @@ def run_container( f"cd {model_dir} && {script_name} {model_args}", timeout=timeout, ) - # Print output to ensure it gets captured in log file - print(model_output) + # When live_output is True, Console.sh() already streamed the output; avoid duplicate print. + if not self.live_output: + print(model_output) run_results["test_duration"] = time.time() - test_start_time print(f"Test Duration: {run_results['test_duration']} seconds") + # Parser-friendly line for SLURM log collection (test_duration: Xs) + print(f"test_duration: {run_results['test_duration']:.2f}s") # Run post-scripts if pre_encapsulate_post_scripts["post_scripts"]: @@ -1092,33 +1088,43 @@ def run_container( multiple_results = model_info.get("multiple_results", None) if multiple_results: - run_results["performance"] = multiple_results - # Validate multiple results file format using proper CSV parsing - try: - import csv - with open(multiple_results, "r") as f: - csv_reader = csv.DictReader(f) - - # Check if 'performance' column exists - if 'performance' not in csv_reader.fieldnames: - print("Error: 'performance' column not found in multiple results file.") - run_results["performance"] = None - else: - # Check if at least one row has a non-empty performance value - has_valid_perf = False - for row in csv_reader: - if row.get('performance', '').strip(): - has_valid_perf = True - break - - if not has_valid_perf: - run_results["performance"] = None - print("Error: Performance metric is empty in all rows of multiple results file.") - except Exception as e: + resolved_path = _resolve_multiple_results_path( + multiple_results, model_dir + ) + if not resolved_path: self.rich_console.print( - f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]" + f"[yellow]Warning: Could not find multiple results file " + f"(tried cwd and {model_dir}/): {multiple_results}[/yellow]" ) run_results["performance"] = None + else: + run_results["performance"] = 
resolved_path + # Validate multiple results file format using proper CSV parsing + try: + import csv + with open(resolved_path, "r") as f: + csv_reader = csv.DictReader(f) + + # Check if 'performance' column exists + if 'performance' not in csv_reader.fieldnames: + print("Error: 'performance' column not found in multiple results file.") + run_results["performance"] = None + else: + # Check if at least one row has a non-empty performance value + has_valid_perf = False + for row in csv_reader: + if row.get('performance', '').strip(): + has_valid_perf = True + break + + if not has_valid_perf: + run_results["performance"] = None + print("Error: Performance metric is empty in all rows of multiple results file.") + except Exception as e: + self.rich_console.print( + f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]" + ) + run_results["performance"] = None else: # Match the actual output format: "performance: 14164 samples_per_second" # Simple pattern to capture number and metric unit @@ -1326,8 +1332,13 @@ def run_container( # Handle multiple results if specified multiple_results = model_info.get("multiple_results", None) + resolved_multiple_results = ( + _resolve_multiple_results_path(multiple_results, model_dir) + if multiple_results + else None + ) if ( - multiple_results + resolved_multiple_results and run_results.get("status") == "SUCCESS" ): # Generate common info JSON for multiple results @@ -1341,7 +1352,7 @@ def run_container( # Update perf.csv with multiple results update_perf_csv( - multiple_results=multiple_results, + multiple_results=resolved_multiple_results, perf_csv=self.perf_csv_path, model_name=run_details_dict["model"], common_info="common_info.json", @@ -1353,11 +1364,11 @@ def run_container( # Update perf_super.json with multiple results try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + scripts_base_dir = scripts_base_dir_from(scripts_path) # Reuse 
common_info.json for super files (no need for duplicate) num_entries = update_perf_super_json( - multiple_results=multiple_results, + multiple_results=resolved_multiple_results, perf_super_json="perf_super.json", model_name=run_details_dict["model"], common_info="common_info.json", @@ -1395,7 +1406,7 @@ def run_container( # Update perf_super.json with single result try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + scripts_base_dir = scripts_base_dir_from(scripts_path) # Use perf_entry.json as input (already created above) if run_results.get("status") == "SUCCESS": @@ -1434,6 +1445,15 @@ def run_container( # Ignore errors if no profiler/trace output files exist pass + # Copy multiple_results CSV to workspace root before run_directory is removed + # so SLURM single-node copy can find it at $WORKSPACE/{{ multiple_results }} + mult_res = model_info.get("multiple_results") + if mult_res: + try: + model_docker.sh(f"cp {model_dir}/{mult_res} . 2>/dev/null || true") + except Exception: + pass + # Cleanup if not keeping alive and not keeping model directory if not keep_alive and not keep_model_dir: model_docker.sh(f"rm -rf {model_dir}", timeout=240) @@ -1480,7 +1500,7 @@ def run_container( # Update perf_super.json with exception result try: scripts_path = model_info.get("scripts", "") - scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + scripts_base_dir = scripts_base_dir_from(scripts_path) # Use perf_entry.json as input (already created above) num_entries = update_perf_super_json( @@ -1545,7 +1565,10 @@ def run_models_from_manifest( # Load deployment_config from manifest for GPU resolution if "deployment_config" in manifest and not self.additional_context: self.additional_context = {"deployment_config": manifest["deployment_config"]} - + # Merge manifest context (e.g. 
skip_perf_collection for multi-node SLURM aggregation) + if "context" in manifest and isinstance(manifest["context"], dict): + self.additional_context = {**(self.additional_context or {}), **manifest["context"]} + if not built_images: self.rich_console.print("[yellow]⚠️ No images found in manifest[/yellow]") return {"successful_runs": [], "failed_runs": []} diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py new file mode 100644 index 00000000..c6f7a25f --- /dev/null +++ b/src/madengine/execution/container_runner_helpers.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +Pure helpers for container run flow (log paths, timeout resolution). + +Extracted so run_container logic is easier to test and maintain. +""" + +import typing + + +def resolve_run_timeout( + model_info: typing.Dict, + cli_timeout: int, + default_cli_timeout: int = 7200, +) -> int: + """ + Resolve effective run timeout from model config and CLI. + + - If model has a timeout and CLI is using default (7200), use model's timeout. + - If CLI timeout is explicitly set (not default), it overrides model timeout. + + Args: + model_info: Model info dict; may have "timeout" key. + cli_timeout: Timeout from CLI. + default_cli_timeout: Value considered "default" for CLI (typically 7200). + + Returns: + Effective timeout in seconds. + """ + if ( + "timeout" in model_info + and model_info["timeout"] is not None + and model_info["timeout"] > 0 + and cli_timeout == default_cli_timeout + ): + return model_info["timeout"] + return cli_timeout + + +def make_run_log_file_path( + model_info: typing.Dict, + docker_image: str, + phase_suffix: str = "", +) -> str: + """ + Build the log file path for a container run. + + Derives dockerfile part from image name (strip ci- and model prefix), + then: {model_safe}_{dockerfile_part}{phase_suffix}.live.log + + Args: + model_info: Must have "name" key. + docker_image: Full docker image name (e.g. 
ci-model_ubuntu.22.04). + phase_suffix: Optional suffix (e.g. ".run"). + + Returns: + Log file path string with "/" replaced by "_". + """ + image_name_without_ci = docker_image.replace("ci-", "") + model_name_clean = model_info["name"].replace("/", "_").lower() + + if image_name_without_ci.startswith(model_name_clean + "_"): + dockerfile_part = image_name_without_ci[len(model_name_clean + "_") :] + else: + dockerfile_part = image_name_without_ci + + log_file_path = ( + model_info["name"].replace("/", "_") + + "_" + + dockerfile_part + + phase_suffix + + ".live.log" + ) + return log_file_path.replace("/", "_") diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py index 3901d864..6727ffda 100644 --- a/src/madengine/execution/docker_builder.py +++ b/src/madengine/execution/docker_builder.py @@ -17,20 +17,17 @@ from madengine.core.console import Console from madengine.core.context import Context from madengine.utils.ops import PythonicTee +from madengine.execution.dockerfile_utils import ( + is_compilation_arch_compatible, + is_target_arch_compatible_with_variable, + parse_dockerfile_gpu_variables, + parse_gpu_variable_value, +) class DockerBuilder: """Class responsible for building Docker images for models.""" - # GPU architecture variables used in MAD/DLM Dockerfiles - GPU_ARCH_VARIABLES = [ - "MAD_SYSTEM_GPU_ARCHITECTURE", - "PYTORCH_ROCM_ARCH", - "GPU_TARGETS", - "GFX_COMPILATION_ARCH", - "GPU_ARCHS" - ] - def __init__( self, context: Context, console: Console = None, live_output: bool = False ): @@ -582,7 +579,7 @@ def _check_dockerfile_has_gpu_variables(self, model_info: typing.Dict) -> typing dockerfile_content = f.read() # Parse GPU architecture variables from Dockerfile - dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + dockerfile_gpu_vars = parse_dockerfile_gpu_variables(dockerfile_content) if dockerfile_gpu_vars: return True, dockerfile_path @@ -633,7 +630,7 @@ def 
_validate_target_arch_against_dockerfile(self, model_info: typing.Dict, targ dockerfile_content = f.read() # Parse GPU architecture variables from Dockerfile - dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + dockerfile_gpu_vars = parse_dockerfile_gpu_variables(dockerfile_content) if not dockerfile_gpu_vars: # No GPU variables found - target arch is acceptable @@ -643,7 +640,7 @@ def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, targ # Validate target architecture against each GPU variable for var_name, var_values in dockerfile_gpu_vars.items(): - if not self._is_target_arch_compatible_with_variable( + if not is_target_arch_compatible_with_variable( var_name, var_values, target_arch ): self.rich_console.print(f"[red]Error: Target architecture '{target_arch}' is not compatible " @@ -662,119 +659,6 @@ def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, targ self.rich_console.print(f"[yellow]Warning: Error validating target architecture for model {model_info['name']}: {e}[/yellow]") return True # Assume compatible on parsing errors - def _parse_dockerfile_gpu_variables(self, dockerfile_content: str) -> typing.Dict[str, typing.List[str]]: - """Parse GPU architecture variables from Dockerfile content.""" - gpu_variables = {} - - for var_name in self.GPU_ARCH_VARIABLES: - # Look for ARG declarations - arg_pattern = rf"ARG\s+{var_name}=([^\s\n]+)" - arg_matches = re.findall(arg_pattern, dockerfile_content, re.IGNORECASE) - - # Look for ENV declarations - env_pattern = rf"ENV\s+{var_name}[=\s]+([^\s\n]+)" - env_matches = re.findall(env_pattern, dockerfile_content, re.IGNORECASE) - - # Process found values - all_matches = arg_matches + env_matches - if all_matches: - # Take the last defined value (in case of multiple definitions) - raw_value = all_matches[-1].strip('"\'') - parsed_values = self._parse_gpu_variable_value(var_name, raw_value) - if parsed_values: - gpu_variables[var_name] = 
parsed_values - - return gpu_variables - - def _parse_gpu_variable_value(self, var_name: str, raw_value: str) -> typing.List[str]: - """Parse GPU variable value based on variable type and format.""" - architectures = [] - - # Handle different variable formats - if var_name in ["GPU_TARGETS", "GPU_ARCHS", "PYTORCH_ROCM_ARCH"]: - # These often contain multiple architectures separated by semicolons or commas - if ";" in raw_value: - architectures = [arch.strip() for arch in raw_value.split(";") if arch.strip()] - elif "," in raw_value: - architectures = [arch.strip() for arch in raw_value.split(",") if arch.strip()] - else: - architectures = [raw_value.strip()] - else: - # Single architecture value (MAD_SYSTEM_GPU_ARCHITECTURE, GFX_COMPILATION_ARCH) - architectures = [raw_value.strip()] - - # Normalize architecture names - normalized_archs = [] - for arch in architectures: - normalized = self._normalize_architecture_name(arch) - if normalized: - normalized_archs.append(normalized) - - return normalized_archs - - def _normalize_architecture_name(self, arch: str) -> str: - """Normalize architecture name to standard format.""" - arch = arch.lower().strip() - - # Handle common variations and aliases - if arch.startswith("gfx"): - return arch - elif arch in ["mi100", "mi-100"]: - return "gfx908" - elif arch in ["mi200", "mi-200", "mi210", "mi250"]: - return "gfx90a" - elif arch in ["mi300", "mi-300", "mi300a"]: - return "gfx940" - elif arch in ["mi300x", "mi-300x"]: - return "gfx942" - elif arch.startswith("mi"): - # Unknown MI series - return as is for potential future support - return arch - - return arch if arch else None - - def _is_target_arch_compatible_with_variable( - self, - var_name: str, - var_values: typing.List[str], - target_arch: str - ) -> bool: - """ - Validate that target architecture is compatible with a specific GPU variable. - Used during build phase validation. 
- """ - if var_name == "MAD_SYSTEM_GPU_ARCHITECTURE": - # MAD_SYSTEM_GPU_ARCHITECTURE will be overridden by target_arch, so always compatible - return True - - elif var_name in ["PYTORCH_ROCM_ARCH", "GPU_TARGETS", "GPU_ARCHS"]: - # Multi-architecture variables - target arch must be in the list - return target_arch in var_values - - elif var_name == "GFX_COMPILATION_ARCH": - # Compilation architecture should be compatible with target arch - return len(var_values) == 1 and ( - var_values[0] == target_arch or - self._is_compilation_arch_compatible(var_values[0], target_arch) - ) - - # Unknown variable - assume compatible - return True - - def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: - """Check if compilation architecture is compatible with target architecture.""" - # Define compatibility rules for compilation - compatibility_matrix = { - "gfx908": ["gfx908"], # MI100 - exact match only - "gfx90a": ["gfx90a"], # MI200 - exact match only - "gfx940": ["gfx940"], # MI300A - exact match only - "gfx941": ["gfx941"], # MI300X - exact match only - "gfx942": ["gfx942"], # MI300X - exact match only - } - - compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) - return target_arch in compatible_archs - def _build_model_single_arch( self, model_info: typing.Dict, @@ -976,20 +860,6 @@ def _determine_registry_image_name( return registry_image - def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: - """Check if compilation architecture is compatible with target architecture.""" - # Define compatibility rules for compilation - compatibility_matrix = { - "gfx908": ["gfx908"], # MI100 - exact match only - "gfx90a": ["gfx90a"], # MI200 - exact match only - "gfx940": ["gfx940"], # MI300A - exact match only - "gfx941": ["gfx941"], # MI300X - exact match only - "gfx942": ["gfx942"], # MI300X - exact match only - } - - compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) - 
return target_arch in compatible_archs - def _build_model_for_arch( self, model_info: typing.Dict, diff --git a/src/madengine/execution/dockerfile_utils.py b/src/madengine/execution/dockerfile_utils.py new file mode 100644 index 00000000..7a7a1bf9 --- /dev/null +++ b/src/madengine/execution/dockerfile_utils.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Pure helpers for parsing Dockerfile GPU variables and validating target architecture. + +Used by DockerBuilder for build-phase validation. No I/O or context dependency. +""" + +import re +import typing + +# GPU architecture variables used in MAD/DLM Dockerfiles +GPU_ARCH_VARIABLES = [ + "MAD_SYSTEM_GPU_ARCHITECTURE", + "PYTORCH_ROCM_ARCH", + "GPU_TARGETS", + "GFX_COMPILATION_ARCH", + "GPU_ARCHS", +] + + +def parse_dockerfile_gpu_variables( + dockerfile_content: str, +) -> typing.Dict[str, typing.List[str]]: + """Parse GPU architecture variables from Dockerfile content.""" + gpu_variables: typing.Dict[str, typing.List[str]] = {} + + for var_name in GPU_ARCH_VARIABLES: + arg_pattern = rf"ARG\s+{var_name}=([^\s\n]+)" + arg_matches = re.findall(arg_pattern, dockerfile_content, re.IGNORECASE) + env_pattern = rf"ENV\s+{var_name}[=\s]+([^\s\n]+)" + env_matches = re.findall(env_pattern, dockerfile_content, re.IGNORECASE) + + all_matches = arg_matches + env_matches + if all_matches: + raw_value = all_matches[-1].strip('"\'') + parsed_values = parse_gpu_variable_value(var_name, raw_value) + if parsed_values: + gpu_variables[var_name] = parsed_values + + return gpu_variables + + +def parse_gpu_variable_value(var_name: str, raw_value: str) -> typing.List[str]: + """Parse GPU variable value based on variable type and format.""" + architectures: typing.List[str] = [] + + if var_name in ["GPU_TARGETS", "GPU_ARCHS", "PYTORCH_ROCM_ARCH"]: + if ";" in raw_value: + architectures = [a.strip() for a in raw_value.split(";") if a.strip()] + elif "," in raw_value: + architectures = [a.strip() for a in raw_value.split(",") if a.strip()] 
+ else: + architectures = [raw_value.strip()] + else: + architectures = [raw_value.strip()] + + normalized_archs = [] + for arch in architectures: + normalized = normalize_architecture_name(arch) + if normalized: + normalized_archs.append(normalized) + return normalized_archs + + +def normalize_architecture_name(arch: str) -> typing.Optional[str]: + """Normalize architecture name to standard format.""" + arch = arch.lower().strip() + if arch.startswith("gfx"): + return arch + if arch in ["mi100", "mi-100"]: + return "gfx908" + if arch in ["mi200", "mi-200", "mi210", "mi250"]: + return "gfx90a" + if arch in ["mi300", "mi-300", "mi300a"]: + return "gfx940" + if arch in ["mi300x", "mi-300x"]: + return "gfx942" + if arch.startswith("mi"): + return arch + return arch if arch else None + + +def is_target_arch_compatible_with_variable( + var_name: str, + var_values: typing.List[str], + target_arch: str, +) -> bool: + """ + Check if target architecture is compatible with a GPU variable's values. 
+ """ + if var_name == "MAD_SYSTEM_GPU_ARCHITECTURE": + return True + if var_name in ["PYTORCH_ROCM_ARCH", "GPU_TARGETS", "GPU_ARCHS"]: + return target_arch in var_values + if var_name == "GFX_COMPILATION_ARCH": + return len(var_values) == 1 and ( + var_values[0] == target_arch + or is_compilation_arch_compatible(var_values[0], target_arch) + ) + return True + + +def is_compilation_arch_compatible(compile_arch: str, target_arch: str) -> bool: + """Check if compilation architecture is compatible with target architecture.""" + compatibility_matrix = { + "gfx908": ["gfx908"], + "gfx90a": ["gfx90a"], + "gfx940": ["gfx940"], + "gfx941": ["gfx941"], + "gfx942": ["gfx942"], + } + compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) + return target_arch in compatible_archs diff --git a/src/madengine/orchestration/image_filtering.py b/src/madengine/orchestration/image_filtering.py new file mode 100644 index 00000000..c65c2240 --- /dev/null +++ b/src/madengine/orchestration/image_filtering.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +""" +Pure image filtering logic for run orchestrator. + +Returns filtered image dicts and lists of skipped items so the orchestrator +can handle logging and CSV writes. No I/O or console here. +""" + +from typing import Any, Dict, List, Tuple + + +def filter_images_by_gpu_compatibility( + built_images: Dict[str, Any], + runtime_gpu_vendor: str, + runtime_gpu_arch: str, +) -> Tuple[Dict[str, Any], List[Tuple[str, str]]]: + """ + Filter images compatible with runtime GPU vendor and architecture. + + Args: + built_images: Dictionary of built images from manifest. + runtime_gpu_vendor: Runtime GPU vendor (AMD, NVIDIA, NONE). + runtime_gpu_arch: Runtime GPU architecture (gfx90a, sm_90, etc.). + + Returns: + (compatible_images, skipped_list) where skipped_list is + [(model_name, reason), ...] for orchestrator to log. 
+ """ + compatible: Dict[str, Any] = {} + skipped: List[Tuple[str, str]] = [] + + for model_name, image_info in built_images.items(): + image_gpu_vendor = image_info.get("gpu_vendor", "") + image_arch = image_info.get("gpu_architecture", "") + + if not image_gpu_vendor: + compatible[model_name] = image_info + continue + + if runtime_gpu_vendor == "NONE" or image_gpu_vendor == runtime_gpu_vendor: + if image_arch: + if image_arch == runtime_gpu_arch: + compatible[model_name] = image_info + else: + skipped.append( + (model_name, f"architecture mismatch ({image_arch} != {runtime_gpu_arch})") + ) + else: + compatible[model_name] = image_info + else: + skipped.append( + (model_name, f"GPU vendor mismatch ({image_gpu_vendor} != {runtime_gpu_vendor})") + ) + + return compatible, skipped + + +def filter_images_by_skip_gpu_arch( + built_images: Dict[str, Any], + built_models: Dict[str, Any], + runtime_gpu_arch: str, + disable_skip: bool = False, +) -> Tuple[Dict[str, Any], List[Tuple[str, Dict, str]]]: + """ + Filter out models that should skip the current GPU architecture. + + Implements skip_gpu_arch from model definitions. + + Args: + built_images: Dictionary of built images from manifest. + built_models: Dictionary of model metadata from manifest. + runtime_gpu_arch: Runtime GPU architecture (gfx90a, A100, etc.). + disable_skip: If True, return all images and empty skipped list. + + Returns: + (compatible_images, skipped_list) where skipped_list is + [(model_name, image_info, gpu_arch), ...] for orchestrator to call + _write_skipped_status(model_name, image_info, gpu_arch). 
+ """ + if disable_skip: + return built_images, [] + + compatible: Dict[str, Any] = {} + skipped: List[Tuple[str, Dict, str]] = [] + + for model_name, image_info in built_images.items(): + model_info = built_models.get(model_name, {}) + skip_gpu_arch_str = model_info.get("skip_gpu_arch", "") + + if not skip_gpu_arch_str: + compatible[model_name] = image_info + continue + + skip_list = [arch.strip() for arch in skip_gpu_arch_str.split(",")] + sys_gpu_arch = runtime_gpu_arch + if sys_gpu_arch and "NVIDIA" in sys_gpu_arch: + sys_gpu_arch = sys_gpu_arch.split()[1] + + if sys_gpu_arch in skip_list: + skipped.append((model_name, image_info, runtime_gpu_arch)) + else: + compatible[model_name] = image_info + + return compatible, skipped diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index e1513bb3..4843e79a 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -31,6 +31,10 @@ ) from madengine.core.constants import get_rocm_path from madengine.utils.session_tracker import SessionTracker +from madengine.orchestration.image_filtering import ( + filter_images_by_gpu_compatibility as _filter_by_gpu_compat, + filter_images_by_skip_gpu_arch as _filter_by_skip_gpu_arch, +) class RunOrchestrator: @@ -981,52 +985,22 @@ def _load_credentials(self) -> Optional[Dict]: def _filter_images_by_gpu_compatibility( self, built_images: Dict, runtime_gpu_vendor: str, runtime_gpu_arch: str ) -> Dict: - """Filter images compatible with runtime GPU vendor and architecture. - - Args: - built_images: Dictionary of built images from manifest - runtime_gpu_vendor: Runtime GPU vendor (AMD, NVIDIA, NONE) - runtime_gpu_arch: Runtime GPU architecture (gfx90a, sm_90, etc.) 
- - Returns: - Dictionary of compatible images - """ + """Filter images compatible with runtime GPU vendor and architecture.""" compatible_images = {} - for model_name, image_info in built_images.items(): - image_gpu_vendor = image_info.get("gpu_vendor", "") - image_arch = image_info.get("gpu_architecture", "") - - # Legacy images without vendor info - treat as compatible for backward compatibility - if not image_gpu_vendor: + if not image_info.get("gpu_vendor", ""): self.rich_console.print( f"[yellow] Warning: {model_name} has no gpu_vendor, treating as compatible (legacy)[/yellow]" ) compatible_images[model_name] = image_info continue - - # Check GPU vendor compatibility first (most important) - if runtime_gpu_vendor == "NONE" or image_gpu_vendor == runtime_gpu_vendor: - # Vendor matches or CPU-only, check architecture if specified - if image_arch: - # Architecture specified, must match - if image_arch == runtime_gpu_arch: - compatible_images[model_name] = image_info - else: - self.rich_console.print( - f"[dim] Skipping {model_name}: architecture mismatch " - f"({image_arch} != {runtime_gpu_arch})[/dim]" - ) - else: - # No architecture specified, vendor match is enough - compatible_images[model_name] = image_info - else: - # Vendor mismatch - self.rich_console.print( - f"[dim] Skipping {model_name}: GPU vendor mismatch " - f"({image_gpu_vendor} != {runtime_gpu_vendor})[/dim]" - ) - + built_with_vendor = {k: v for k, v in built_images.items() if v.get("gpu_vendor")} + compat, skipped = _filter_by_gpu_compat( + built_with_vendor, runtime_gpu_vendor, runtime_gpu_arch + ) + compatible_images.update(compat) + for model_name, reason in skipped: + self.rich_console.print(f"[dim] Skipping {model_name}: {reason}[/dim]") return compatible_images def _filter_images_by_gpu_architecture( @@ -1042,51 +1016,21 @@ def _filter_images_by_gpu_architecture( def _filter_images_by_skip_gpu_arch( self, built_images: Dict, built_models: Dict, runtime_gpu_arch: str ) -> Dict: - """Filter 
out models that should skip the current GPU architecture. - - This implements the skip_gpu_arch logic from model definitions, - where models can specify GPU architectures they don't support. - - Args: - built_images: Dictionary of built images from manifest - built_models: Dictionary of model metadata from manifest - runtime_gpu_arch: Runtime GPU architecture (gfx90a, A100, etc.) - - Returns: - Dictionary of images that should run (not skipped) - """ - if getattr(self.args, 'disable_skip_gpu_arch', False): - # User disabled skip logic, run all models - self.rich_console.print("[dim] --disable-skip-gpu-arch flag set, skipping GPU architecture checks[/dim]") + """Filter out models that should skip the current GPU architecture.""" + disable = getattr(self.args, "disable_skip_gpu_arch", False) + if disable: + self.rich_console.print( + "[dim] --disable-skip-gpu-arch flag set, skipping GPU architecture checks[/dim]" + ) return built_images - - compatible_images = {} - - for model_name, image_info in built_images.items(): - # Get model metadata to check skip_gpu_arch field - model_info = built_models.get(model_name, {}) - skip_gpu_arch_str = model_info.get("skip_gpu_arch", "") - - if skip_gpu_arch_str: - # Parse comma-separated list of architectures to skip - skip_list = [arch.strip() for arch in skip_gpu_arch_str.split(",")] - - # Normalize architecture comparison (handle "NVIDIA A100" -> "A100") - sys_gpu_arch = runtime_gpu_arch - if sys_gpu_arch and "NVIDIA" in sys_gpu_arch: - sys_gpu_arch = sys_gpu_arch.split()[1] - - if sys_gpu_arch in skip_list: - self.rich_console.print( - f"[yellow] Skipping model {model_name} as it is not supported on {runtime_gpu_arch} architecture.[/yellow]" - ) - - # Write SKIPPED status to perf CSV - self._write_skipped_status(model_name, image_info, runtime_gpu_arch) - continue - - compatible_images[model_name] = image_info - + compatible_images, skipped = _filter_by_skip_gpu_arch( + built_images, built_models, runtime_gpu_arch, 
disable_skip=False + ) + for model_name, image_info, gpu_arch in skipped: + self.rich_console.print( + f"[yellow] Skipping model {model_name} as it is not supported on {gpu_arch} architecture.[/yellow]" + ) + self._write_skipped_status(model_name, image_info, gpu_arch) return compatible_images def _write_skipped_status(self, model_name: str, image_info: Dict, gpu_arch: str) -> None: diff --git a/src/madengine/reporting/update_perf_super.py b/src/madengine/reporting/update_perf_super.py index f0d5cda5..f0b1753c 100644 --- a/src/madengine/reporting/update_perf_super.py +++ b/src/madengine/reporting/update_perf_super.py @@ -100,11 +100,11 @@ def handle_multiple_results_super( # Parse config file from args if present configs_data = None if 'args' in common_info_json and common_info_json['args']: - # Try to extract config path from args - scripts_path = common_info_json.get('pipeline', '') + # model_scripts_path: use None so resolution relies on config_parser.scripts_base_dir + # (callers pass scripts_base_dir when creating the parser; 'pipeline' is not a path) configs_data = config_parser.parse_and_load( common_info_json['args'], - scripts_path + None ) # Process each result row @@ -118,7 +118,10 @@ def handle_multiple_results_super( # Extract standard performance/metric columns record["performance"] = result_row.pop("performance") record["metric"] = result_row.pop("metric") - + # test_duration for Duration column in reports (avoid N/A when CSV has it) + _td = result_row.pop("test_duration", "") + record["test_duration"] = "" if (_td is None or _td == "" or pd.isna(_td)) else str(_td) + # Put remaining metrics into multi_results # Exclude internal fields that shouldn't be in multi_results extra_metrics = {k: v for k, v in result_row.items() diff --git a/src/madengine/utils/path_utils.py b/src/madengine/utils/path_utils.py new file mode 100644 index 00000000..637efb8f --- /dev/null +++ b/src/madengine/utils/path_utils.py @@ -0,0 +1,33 @@ +"""Path resolution helpers for 
madengine. + +Provides scripts_base_dir_from() and get_madengine_root() for consistent +path handling across execution and deployment. +""" + +import os +from pathlib import Path +from typing import Optional + + +def scripts_base_dir_from(scripts_path: Optional[str]) -> Optional[str]: + """Return the directory containing the given scripts path, or None. + + Args: + scripts_path: Path to a script file (or directory). Can be None or empty. + + Returns: + Parent directory as string, or None if scripts_path is None/empty. + """ + if not scripts_path or not str(scripts_path).strip(): + return None + return os.path.dirname(scripts_path) + + +def get_madengine_root() -> Path: + """Return the madengine package root directory (where madengine/__init__.py lives). + + Returns: + Path to the madengine package root. + """ + import madengine + return Path(madengine.__file__).resolve().parent diff --git a/src/madengine/utils/run_details.py b/src/madengine/utils/run_details.py new file mode 100644 index 00000000..c21776b0 --- /dev/null +++ b/src/madengine/utils/run_details.py @@ -0,0 +1,31 @@ +"""Shared helpers for run-details / perf row construction. + +Centralizes pipeline and build_number from environment so container_runner +and deployment (e.g. kubernetes) use the same values. Also provides +flatten_tags_in_place for consistent tags handling. +""" + +import os +from typing import Any, Dict + + +def get_pipeline() -> str: + """Return pipeline name from environment (e.g. CI pipeline).""" + return os.environ.get("pipeline", "") + + +def get_build_number() -> str: + """Return build number from environment (e.g. CI BUILD_NUMBER).""" + return os.environ.get("BUILD_NUMBER", "0") + + +def flatten_tags_in_place(record: Dict[str, Any]) -> None: + """Flatten 'tags' in a run-details record in place. + + If record['tags'] is a list, convert it to a comma-separated string. + Matches behavior expected by perf CSV and update_perf_csv. 
+ """ + if "tags" not in record: + return + if isinstance(record["tags"], list): + record["tags"] = ",".join(str(item) for item in record["tags"]) diff --git a/tests/e2e/test_build_workflows.py b/tests/e2e/test_build_workflows.py index 452c8af4..aec87459 100644 --- a/tests/e2e/test_build_workflows.py +++ b/tests/e2e/test_build_workflows.py @@ -24,6 +24,7 @@ from tests.fixtures.utils import BASE_DIR, MODEL_DIR from tests.fixtures.utils import global_data from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import DEFAULT_CLEAN_FILES from tests.fixtures.utils import generate_additional_context_for_machine @@ -163,7 +164,7 @@ class TestDiscover: """Test the model discovery feature.""" @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_static(self, global_data, clean_test_temp_files): """ @@ -189,7 +190,7 @@ def test_static(self, global_data, clean_test_temp_files): pytest.fail("dummy2/model2 did not run successfully.") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_dynamic(self, global_data, clean_test_temp_files): """ @@ -215,7 +216,7 @@ def test_dynamic(self, global_data, clean_test_temp_files): pytest.fail("dummy3/model4 did not run successfully.") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_additional_args(self, global_data, clean_test_temp_files): """ @@ -245,7 +246,7 @@ def test_additional_args(self, global_data, clean_test_temp_files): pytest.fail("dummy2/model2:batch-size=32 did not run successfully.") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def 
test_multiple(self, global_data, clean_test_temp_files): """ diff --git a/tests/e2e/test_data_workflows.py b/tests/e2e/test_data_workflows.py index 93709051..42e564e6 100644 --- a/tests/e2e/test_data_workflows.py +++ b/tests/e2e/test_data_workflows.py @@ -18,6 +18,7 @@ from tests.fixtures.utils import BASE_DIR, MODEL_DIR from tests.fixtures.utils import global_data from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import DEFAULT_CLEAN_FILES from madengine.core.dataprovider import Data @@ -85,7 +86,7 @@ def test_reorder_data_provider_config(self): os.unlink(temp_file_path) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_local_data_provider_runs_successfully( self, global_data, clean_test_temp_files diff --git a/tests/e2e/test_execution_features.py b/tests/e2e/test_execution_features.py index 7a0ac120..fae4cb70 100644 --- a/tests/e2e/test_execution_features.py +++ b/tests/e2e/test_execution_features.py @@ -15,6 +15,12 @@ from tests.fixtures.utils import clean_test_temp_files from tests.fixtures.utils import is_nvidia from tests.fixtures.utils import generate_additional_context_for_machine +from tests.fixtures.utils import ( + DEFAULT_CLEAN_FILES, + build_run_command, + get_run_live_log_path, + get_timeout_seconds_from_log, +) @@ -25,180 +31,35 @@ class TestCustomTimeoutsFunctionality: @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) - def test_default_model_timeout_2hrs(self, global_data, clean_test_temp_files): - """ - default model timeout is 2 hrs - This test only checks if the timeout is set; it does not actually time the model. 
- """ - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy" - ) - - regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") - foundTimeout = None - with open( - os.path.join( - BASE_DIR, - "dummy_dummy.ubuntu." - + ("amd" if not is_nvidia() else "nvidia") - + ".run.live.log", - ), - "r", - ) as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != "7200": - pytest.fail("default model timeout is not 2 hrs (" + str(foundTimeout) + "s).") - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_can_override_timeout_in_model(self, global_data, clean_test_temp_files): - """ - timeout can be overridden in model - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_timeout" - ) - - regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") - foundTimeout = None - with open( - os.path.join( - BASE_DIR, - "dummy_timeout_dummy.ubuntu." - + ("amd" if not is_nvidia() else "nvidia") - + ".run.live.log", - ), - "r", - ) as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != "360": - pytest.fail( - "timeout in models.json (360s) could not override actual timeout (" - + str(foundTimeout) - + "s)." 
- ) - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "tags,log_base_name,expected_seconds,extra_args", + [ + ("dummy", "dummy_dummy", "7200", ""), + ("dummy_timeout", "dummy_timeout_dummy", "360", ""), + ("dummy", "dummy_dummy", "120", "--timeout 120"), + ("dummy_timeout", "dummy_timeout_dummy", "120", "--timeout 120"), + ], ) - def test_can_override_timeout_in_commandline( - self, global_data, clean_test_temp_files - ): - """ - timeout command-line argument overrides default timeout - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy --timeout 120" - ) - - regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") - foundTimeout = None - with open( - os.path.join( - BASE_DIR, - "dummy_dummy.ubuntu." - + ("amd" if not is_nvidia() else "nvidia") - + ".run.live.log", - ), - "r", - ) as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != "120": - pytest.fail( - "timeout command-line argument (120s) could not override actual timeout (" - + foundTimeout - + "s)." - ) - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_commandline_timeout_overrides_model_timeout( - self, global_data, clean_test_temp_files + def test_timeout_value_in_log( + self, global_data, clean_test_temp_files, tags, log_base_name, expected_seconds, extra_args ): """ - timeout command-line argument overrides model timeout - This test only checks if the timeout is set; it does not actually time the model. + Timeout is set as expected (default 2h, model override, CLI override). + Only checks the value in the log; does not actually time the model. 
""" - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_timeout --timeout 120" - ) - - regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") - foundTimeout = None - with open( - os.path.join( - BASE_DIR, - "dummy_timeout_dummy.ubuntu." - + ("amd" if not is_nvidia() else "nvidia") - + ".run.live.log", - ), - "r", - ) as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != "120": + global_data["console"].sh(build_run_command(tags, extra_args=extra_args)) + log_path = get_run_live_log_path(log_base_name) + found = get_timeout_seconds_from_log(log_path) + if found != expected_seconds: pytest.fail( - "timeout in command-line argument (360s) could not override model.json timeout (" - + foundTimeout - + "s)." + f"expected timeout {expected_seconds}s in log, got {found}s (log: {log_path})." 
) @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_timeout_in_commandline_timesout_correctly( @@ -226,7 +87,7 @@ def test_timeout_in_commandline_timesout_correctly( @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_timeout_in_model_timesout_correctly( @@ -263,7 +124,7 @@ class TestDebuggingFunctionality: @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_keepAlive_keeps_docker_alive(self, global_data, clean_test_temp_files): @@ -301,7 +162,7 @@ def test_keepAlive_keeps_docker_alive(self, global_data, clean_test_temp_files): @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_no_keepAlive_does_not_keep_docker_alive( @@ -342,7 +203,7 @@ def test_no_keepAlive_does_not_keep_docker_alive( @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_keepAlive_preserves_model_dir(self, global_data, clean_test_temp_files): @@ -374,7 +235,7 @@ def test_keepAlive_preserves_model_dir(self, global_data, clean_test_temp_files) @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + ["run_directory"]], indirect=True, ) def test_keepModelDir_keeps_model_dir(self, global_data, clean_test_temp_files): @@ -398,7 +259,7 @@ def test_keepModelDir_keeps_model_dir(self, global_data, clean_test_temp_files): @pytest.mark.parametrize( "clean_test_temp_files", - [["perf.csv", "perf.html", "run_directory"]], + [DEFAULT_CLEAN_FILES + 
["run_directory"]], indirect=True, ) def test_no_keepModelDir_does_not_keep_model_dir( @@ -432,7 +293,7 @@ class TestLiveOutputFunctionality: """Test the live output functionality.""" @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_default_silent_run(self, global_data, clean_test_temp_files): """ @@ -458,7 +319,7 @@ def test_default_silent_run(self, global_data, clean_test_temp_files): pytest.fail("default run is not silent") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_liveOutput_prints_output_to_screen( self, global_data, clean_test_temp_files diff --git a/tests/e2e/test_profiling_workflows.py b/tests/e2e/test_profiling_workflows.py index 907dcb7c..bd352d6c 100644 --- a/tests/e2e/test_profiling_workflows.py +++ b/tests/e2e/test_profiling_workflows.py @@ -19,6 +19,7 @@ from tests.fixtures.utils import ( BASE_DIR, MODEL_DIR, + DEFAULT_CLEAN_FILES, global_data, clean_test_temp_files, requires_gpu, @@ -270,7 +271,7 @@ def test_miopen_trace_runs_correctly(self, global_data, clean_test_temp_files): @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): """ @@ -311,7 +312,7 @@ def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_toolA_runs_correctly(self, global_data, clean_test_temp_files): """ @@ -357,7 +358,7 @@ def test_toolA_runs_correctly(self, global_data, 
clean_test_temp_files): pytest.fail("all strings were not matched in toolA test.") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_stackable_design_runs_correctly(self, global_data, clean_test_temp_files): """ diff --git a/tests/e2e/test_run_workflows.py b/tests/e2e/test_run_workflows.py index 5b3f894d..bf770891 100644 --- a/tests/e2e/test_run_workflows.py +++ b/tests/e2e/test_run_workflows.py @@ -21,6 +21,11 @@ from tests.fixtures.utils import get_num_cpus from tests.fixtures.utils import requires_gpu from tests.fixtures.utils import generate_additional_context_for_machine +from tests.fixtures.utils import ( + DEFAULT_CLEAN_FILES, + build_run_command, + assert_model_in_perf_csv, +) from madengine.core.context import Context @@ -32,72 +37,33 @@ class TestContexts: @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", + [DEFAULT_CLEAN_FILES + ["ctx_test"]], + indirect=True, ) - def test_dockerfile_picked_on_detected_context_0( - self, global_data, clean_test_temp_files - ): - """ - picks dockerfile based on detected context and only those - """ - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest " - ) - - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row["model"] == "dummy_ctxtest": - if row["status"] == "SUCCESS" and row["performance"] == "0": - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html", "ctx_test"]], indirect=True + "ctx_value,expected_performance", + [(None, 
"0"), ("1", "1")], ) - def test_dockerfile_picked_on_detected_context_1( - self, global_data, clean_test_temp_files + def test_dockerfile_picked_on_detected_context( + self, global_data, clean_test_temp_files, ctx_value, expected_performance ): """ - picks dockerfile based on detected context and only those - """ - with open(os.path.join(BASE_DIR, "ctx_test"), "w") as ctx_test_file: - print("1", file=ctx_test_file) - - global_data["console"].sh( - "cd " - + BASE_DIR - + "; " - + "MODEL_DIR=" - + MODEL_DIR - + " " - + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest " + Picks dockerfile based on detected context (no ctx_test file -> 0; ctx_test=1 -> 1). + """ + if ctx_value is not None: + with open(os.path.join(BASE_DIR, "ctx_test"), "w") as f: + print(ctx_value, file=f) + global_data["console"].sh(build_run_command("dummy_ctxtest")) + assert_model_in_perf_csv( + os.path.join(BASE_DIR, "perf.csv"), + "dummy_ctxtest", + status="SUCCESS", + performance=expected_performance, ) - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row["model"] == "dummy_ctxtest": - if row["status"] == "SUCCESS" and row["performance"] == "1": - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html", "ctx_test"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES + ["ctx_test"]], indirect=True ) def test_all_dockerfiles_matching_context_executed( self, global_data, clean_test_temp_files @@ -146,7 +112,7 @@ def test_dockerfile_executed_if_contexts_keys_are_not_common(self): pass @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def 
test_can_override_context_with_additionalContext_commandline( self, global_data, clean_test_temp_files @@ -245,7 +211,7 @@ def test_additionalContext_commandline_overrides_additionalContextFile( pytest.fail("model did not pick correct context.") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_base_docker_override(self, global_data, clean_test_temp_files): """ @@ -277,7 +243,7 @@ def test_base_docker_override(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_docker_image_override(self, global_data, clean_test_temp_files): """ @@ -309,7 +275,7 @@ def test_docker_image_override(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_docker_env_vars_override(self, global_data, clean_test_temp_files): """ @@ -340,7 +306,7 @@ def test_docker_env_vars_override(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_docker_mounts_mount_host_paths_in_docker_container( self, global_data, clean_test_temp_files @@ -489,7 +455,7 @@ def test_gpu_product_name_matches_arch(self): class TestTagsFunctionality: @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_can_select_model_subset_with_commandline_tag_argument( self, global_data, clean_test_temp_files @@ -516,7 +482,7 @@ def test_can_select_model_subset_with_commandline_tag_argument( pytest.fail("dummy2 tag not 
selected with commandline --tags argument") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_all_models_matching_any_tag_selected_with_multiple_tags( self, global_data, clean_test_temp_files @@ -546,7 +512,7 @@ def test_all_models_matching_any_tag_selected_with_multiple_tags( pytest.fail("dummy3 tag not selected with commandline --tags argument") @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_model_names_are_automatically_tags( self, global_data, clean_test_temp_files diff --git a/tests/e2e/test_scripting_workflows.py b/tests/e2e/test_scripting_workflows.py index 682eb53a..0f44d7d3 100644 --- a/tests/e2e/test_scripting_workflows.py +++ b/tests/e2e/test_scripting_workflows.py @@ -17,6 +17,7 @@ from tests.fixtures.utils import BASE_DIR, MODEL_DIR from tests.fixtures.utils import global_data from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import DEFAULT_CLEAN_FILES from tests.fixtures.utils import is_nvidia from tests.fixtures.utils import generate_additional_context_for_machine @@ -24,7 +25,7 @@ class TestPrePostScriptsFunctionality: @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_pre_scripts_run_before_model(self, global_data, clean_test_temp_files): """ @@ -64,7 +65,7 @@ def test_pre_scripts_run_before_model(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_post_scripts_run_after_model(self, global_data, clean_test_temp_files): """ @@ -104,7 +105,7 @@ def test_post_scripts_run_after_model(self, global_data, 
clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_pre_scripts_accept_arguments(self, global_data, clean_test_temp_files): """ @@ -144,7 +145,7 @@ def test_pre_scripts_accept_arguments(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_post_scripts_accept_arguments(self, global_data, clean_test_temp_files): """ @@ -184,7 +185,7 @@ def test_post_scripts_accept_arguments(self, global_data, clean_test_temp_files) ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_both_pre_and_post_scripts_run_before_and_after_model( self, global_data, clean_test_temp_files @@ -249,7 +250,7 @@ def test_both_pre_and_post_scripts_run_before_and_after_model( ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_all_pre_scripts_run_in_order(self, global_data, clean_test_temp_files): """ @@ -297,7 +298,7 @@ def test_all_pre_scripts_run_in_order(self, global_data, clean_test_temp_files): ) @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + "clean_test_temp_files", [DEFAULT_CLEAN_FILES], indirect=True ) def test_all_post_scripts_run_in_order(self, global_data, clean_test_temp_files): """ diff --git a/tests/fixtures/dummy/credential.json b/tests/fixtures/dummy/credential.json index 04e514b5..67675fef 100644 --- a/tests/fixtures/dummy/credential.json +++ b/tests/fixtures/dummy/credential.json @@ -1,21 +1,42 @@ { - "NAS_NODES": [ + "ROCmSoftwarePlatform": { + "username": "your-username", + "password": 
"your-token-or-password" + }, + "PUBLIC_GITHUB_ROCM_KEY": { + "username": "your-github-username", + "password": "your-github-token" + }, + "gerrit-git.amd.com": { + "username": "your-gerrit-username", + "ssh_key_file": "id_rsa" + }, + "AMD_GITHUB": { + "username": "your-github-amd-username", + "password": "your-credential" + }, + "MAD_AWS_S3": { + "USERNAME": "your-aws-access-key", + "PASSWORD": "your-aws-secret-key" + }, + "NAS_NODES": [ { - "NAME": "default", - "HOST": "localhost", - "PORT": "22", - "USERNAME": "admin", - "PASSWORD": "admin" + "NAME": "default", + "HOST": "localhost", + "PORT": "22", + "USERNAME": "username", + "PASSWORD": "password" } - ], - "MAD_AWS_S3": { - "USERNAME": "admin", - "PASSWORD": "admin" - }, - "MAD_MINIO": { - "USERNAME": "admin-access-key", - "PASSWORD": "admin-secret-key", - "MINIO_ENDPOINT": "http://127.0.1:9000", - "AWS_ENDPOINT_URL_S3": "http://127.0.1:9000" - } -} \ No newline at end of file + ], + "MAD_MINIO": { + "USERNAME": "your-minio-username", + "PASSWORD": "your-minio-password", + "MINIO_ENDPOINT": "http://localhost:9000", + "AWS_ENDPOINT_URL_S3": "http://localhost:9000" + }, + "dockerhub": { + "repository": "your-org/your-repo", + "username": "your-docker-username", + "password": "your-docker-token" + } +} diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 28b3db7f..fab99fae 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -264,6 +264,20 @@ ], "args": "" }, + { + "name": "dummy_torchrun_multi", + "dockerfile": "docker/dummy_torchrun", + "scripts": "scripts/dummy_torchrun/run_multi.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed" + ], + "args": "", + "multiple_results": "perf_dummy_torchrun.csv" + }, { "name": "dummy_torchrun_helper", "dockerfile": "docker/dummy_torchrun", diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_multi.sh 
b/tests/fixtures/dummy/scripts/dummy_torchrun/run_multi.sh new file mode 100644 index 00000000..b6093ebe --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_multi.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# Bash wrapper for dummy_torchrun_multi: torchrun + multiple_results CSV output +# Uses MAD_MULTI_NODE_RUNNER for torchrun launcher; writes perf_dummy_torchrun.csv +# + +set -e + +echo "========================================================================" +echo "madengine Torchrun Multi (multiple_results) Wrapper Script" +echo "========================================================================" + +# Get current directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Determine multi-node runner to use +if [ -z "$MAD_MULTI_NODE_RUNNER" ]; then + N_GPUS="${MAD_RUNTIME_NGPUS:-1}" + echo "ℹ️ MAD_MULTI_NODE_RUNNER not set, using standalone torchrun" + echo "ℹ️ Using $N_GPUS GPUs" + MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node=$N_GPUS" +fi + +echo "========================================================================" +echo "Launcher Command:" +echo "$MAD_MULTI_NODE_RUNNER" +echo "========================================================================" + +# Create MIOpen cache directory if MIOPEN_USER_DB_PATH is set +if [ -n "$MIOPEN_USER_DB_PATH" ]; then + MIOPEN_BASE_DIR=$(dirname "$MIOPEN_USER_DB_PATH") + mkdir -p "$MIOPEN_BASE_DIR" + echo "ℹ️ MIOpen cache directory: $MIOPEN_USER_DB_PATH" +fi + +# Execute the Python script (multiple_results variant) +echo "Executing: $MAD_MULTI_NODE_RUNNER run_torchrun_multi.py" +$MAD_MULTI_NODE_RUNNER run_torchrun_multi.py +PYTHON_EXIT_CODE=$? 
+ +echo "========================================================================" +echo "Training script completed with exit code: $PYTHON_EXIT_CODE" +echo "========================================================================" + +if [ $PYTHON_EXIT_CODE -ne 0 ]; then + echo "ERROR: Training script failed with exit code $PYTHON_EXIT_CODE" + exit $PYTHON_EXIT_CODE +fi diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py new file mode 100644 index 00000000..dc09b9cf --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_multi.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +PyTorch Distributed Training Benchmark for madengine (multiple_results variant) + +Same as run_torchrun.py but writes a CSV file for multiple_results collection: +- perf_dummy_torchrun.csv with columns: model,temperature,performance,metric,test_duration +- Compatible with madengine multiple_results aggregation + +Usage: + torchrun --standalone --nproc_per_node=1 run_torchrun_multi.py + torchrun --standalone --nproc_per_node=8 run_torchrun_multi.py +""" + +import os +import sys +import time +import socket +import csv +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Configuration +BATCH_SIZE = 128 # Per-GPU batch size +NUM_EPOCHS = 5 +NUM_BATCHES = 100 # Number of synthetic batches per epoch +IMAGE_SIZE = 224 +NUM_CLASSES = 1000 +MULTI_RESULTS_CSV = "perf_dummy_torchrun.csv" + +# Get distributed environment variables (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) +master_addr = os.environ.get("MASTER_ADDR", "localhost") +master_port = os.environ.get("MASTER_PORT", "29500") + + +def print_header(): + """Print benchmark header""" + print("=" * 70) + 
print("madengine PyTorch Distributed Training Benchmark (multi_results)") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Rank: {rank}/{world_size}") + print(f"Local Rank (GPU): {local_rank}") + if world_size > 1: + print(f"Master: {master_addr}:{master_port}") + print(f"\nConfiguration:") + print(f" Batch Size (per GPU): {BATCH_SIZE}") + print(f" Global Batch Size: {BATCH_SIZE * world_size}") + print(f" Epochs: {NUM_EPOCHS}") + print(f" Batches per Epoch: {NUM_BATCHES}") + print(f" Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") + print(f" Num Classes: {NUM_CLASSES}") + print("=" * 70) + + +class SimpleCNN(nn.Module): + """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): + super(SimpleCNN, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(128) + + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm2d(256) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, num_classes) + + def forward(self, x): + x = self.pool(F.relu(self.bn1(self.conv1(x)))) + x = self.pool(F.relu(self.bn2(self.conv2(x)))) + x = self.pool(F.relu(self.bn3(self.conv3(x)))) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + +def generate_synthetic_batch(batch_size, device): + """Generate synthetic data for benchmarking""" + images = torch.randn(batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, NUM_CLASSES, (batch_size,), device=device) + return images, labels + + +def train_epoch(model, optimizer, criterion, epoch, device): + """Train for one epoch with node-local throughput measurement""" + model.train() + epoch_start = time.time() + total_samples = 0 + total_loss = 0.0 + + for batch_idx in range(NUM_BATCHES): + batch_start 
= time.time() + + # Generate synthetic data + images, labels = generate_synthetic_batch(BATCH_SIZE, device) + + # Forward pass + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward pass (gradients are automatically synchronized across GPUs) + loss.backward() + + # Update weights + optimizer.step() + + batch_time = time.time() - batch_start + total_samples += BATCH_SIZE + total_loss += loss.item() + + # Print progress from local rank 0 on each node + if local_rank == 0 and (batch_idx + 1) % 20 == 0: + avg_loss = total_loss / (batch_idx + 1) + throughput = BATCH_SIZE / batch_time # Local throughput + print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec (local)") + + epoch_time = time.time() - epoch_start + avg_loss = total_loss / NUM_BATCHES + + # ======================================================================== + # Node-Local Throughput Measurement + # ======================================================================== + local_samples = NUM_BATCHES * BATCH_SIZE + local_gpu_throughput = local_samples / epoch_time + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + node_throughput = local_gpu_throughput * local_world_size + + metrics = { + 'avg_loss': avg_loss, + 'node_throughput': node_throughput, + 'epoch_time': epoch_time, + 'local_world_size': local_world_size + } + + return metrics + + +def main(): + """Main training function""" + test_start_time = time.time() + + print_header() + + # Create per-process MIOpen cache directory to avoid database conflicts + if "MIOPEN_USER_DB_PATH" in os.environ: + miopen_template = os.environ["MIOPEN_USER_DB_PATH"] + miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + os.makedirs(miopen_path, exist_ok=True) + print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") + + # 
Initialize distributed training + if world_size > 1: + print(f"\n[Rank {rank}] Initializing distributed process group...") + dist.init_process_group( + backend="nccl", + init_method=f"env://", + world_size=world_size, + rank=rank + ) + print(f"[Rank {rank}] ✓ Process group initialized") + print(f"[Rank {rank}] Backend: {dist.get_backend()}") + print(f"[Rank {rank}] World Size: {dist.get_world_size()}") + else: + print(f"\n=== Running in Standalone Mode (Single GPU) ===") + + # Set device + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") + print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") + + if local_rank >= num_gpus: + print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print(f"[Rank {rank}] Using cuda:0 instead") + device = torch.device("cuda:0") + else: + device = torch.device(f"cuda:{local_rank}") + + torch.cuda.set_device(device) + print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") + else: + device = torch.device("cpu") + print(f"[Rank {rank}] Warning: CUDA not available, using CPU") + + # Create model + print(f"\n[Rank {rank}] Creating model...") + model = SimpleCNN(num_classes=NUM_CLASSES).to(device) + + if world_size > 1: + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=True, + find_unused_parameters=False + ) + print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss() + + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + node_rank = rank // local_world_size if local_world_size > 0 else 0 + + if local_rank == 0: + print(f"\n{'='*70}") + print(f"[Node {node_rank}] Starting Training") + print(f"{'='*70}") + + # Training loop + all_metrics = [] + for 
epoch in range(NUM_EPOCHS): + metrics = train_epoch( + model, optimizer, criterion, epoch, device + ) + all_metrics.append(metrics) + + if local_rank == 0: + print(f"\n[Node {node_rank}] Epoch [{epoch+1}/{NUM_EPOCHS}] Complete:") + print(f" Average Loss: {metrics['avg_loss']:.4f}") + print(f" Node Throughput: {metrics['node_throughput']:.2f} samples/sec") + print(f" Local GPUs: {metrics['local_world_size']}") + + avg_node_throughput = sum(m['node_throughput'] for m in all_metrics) / len(all_metrics) + avg_epoch_time = sum(m['epoch_time'] for m in all_metrics) / len(all_metrics) + + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # Node-Local Performance Reporting + multiple_results CSV + if local_rank == 0: + print(f"\n{'='*70}") + print("Node Performance Summary") + print(f"{'='*70}") + print(f"Node ID: {node_rank}") + print(f"Node Hostname: {socket.gethostname()}") + print(f"Local GPUs: {local_world_size}") + print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") + print(f"Avg Time per Epoch: {avg_epoch_time:.2f}s") + print(f"{'='*70}") + + print(f"performance: {avg_node_throughput:.2f} samples_per_second", flush=True) + print(f"node_id: {node_rank}", flush=True) + print(f"local_gpus: {local_world_size}", flush=True) + + test_duration = time.time() - test_start_time + print(f"test_duration: {test_duration:.2f}s", flush=True) + sys.stdout.flush() + + # Write multiple_results CSV (model,temperature,performance,metric,test_duration for duration in reports) + with open(MULTI_RESULTS_CSV, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["model", "temperature", "performance", "metric", "test_duration"]) + test_dur_str = f"{test_duration:.2f}s" + for i in range(4): + # Vary temperature for multiple rows; use throughput for performance + writer.writerow([ + i + 1, + 20 + i * 5, + f"{avg_node_throughput:.2f}", + "samples_per_sec", + test_dur_str, + ]) + print(f"Wrote {MULTI_RESULTS_CSV} for multiple_results collection", 
flush=True) + + # Cleanup + if world_size > 1: + dist.destroy_process_group() + if rank == 0: + print(f"✓ Process group destroyed") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/utils.py b/tests/fixtures/utils.py index 64b9d50b..21198b90 100644 --- a/tests/fixtures/utils.py +++ b/tests/fixtures/utils.py @@ -4,12 +4,13 @@ """ # built-in modules +import csv import os +import re +import shutil +import subprocess import sys import json -import subprocess -import shutil -import re import pytest from unittest.mock import MagicMock @@ -305,10 +306,10 @@ def get_num_cpus() -> int: int: Number of CPUs present. """ global _num_cpus_cache - + if _num_cpus_cache is not None: return _num_cpus_cache - + try: # Lazy import to avoid collection issues from madengine.core.console import Console @@ -319,3 +320,110 @@ def get_num_cpus() -> int: # Default to 64 CPUs if detection fails during collection _num_cpus_cache = 64 return _num_cpus_cache + + +# ============================================================================= +# E2E test helpers (run command, perf CSV, log path, timeout from log) +# ============================================================================= + +# Default list of perf output files to clean before/after e2e tests. +DEFAULT_CLEAN_FILES = ["perf.csv", "perf.html"] + + +def build_run_command(tags, extra_args="", output_file=None, additional_context=None): + """Build the base shell command for 'madengine run' e2e tests. + + Args: + tags: Model tag(s) string, e.g. 'dummy' or 'dummy_ctxtest'. + extra_args: Optional CLI args to append (e.g. '--timeout 120'). + output_file: Optional output CSV path (e.g. 'perf.csv' or 'perf_test.csv'). + additional_context: Optional dict or JSON-str for --additional-context. + If dict, it is json.dumps'd and wrapped in single quotes. 
+ + Returns: + str: Full command to run in shell (cd BASE_DIR; MODEL_DIR=... python3 -m ...). + """ + parts = [ + "cd " + BASE_DIR + ";", + "MODEL_DIR=" + MODEL_DIR, + "python3 -m madengine.cli.app run --live-output --tags " + tags, + ] + cmd = " ".join(parts) + if output_file: + cmd += " -o " + output_file + if additional_context is not None: + if isinstance(additional_context, dict): + ctx_str = json.dumps(additional_context) + cmd += ' --additional-context "' + ctx_str.replace('"', '\\"') + '"' + else: + ctx_str = additional_context + cmd += " --additional-context '" + ctx_str.replace("'", "'\"'\"'") + "'" + if extra_args: + cmd += " " + extra_args.strip() + return cmd + + +def assert_model_in_perf_csv(csv_path, model, status="SUCCESS", performance=None): + """Assert that a model row exists in perf CSV with given status and optional performance. + + Fails with pytest.fail if no matching row or row does not match expectations. + + Args: + csv_path: Path to the perf CSV file. + model: Model name to find (e.g. 'dummy_ctxtest' or 'dummy'). + status: Expected status string (default 'SUCCESS'). + performance: Optional expected performance value (compared as string to CSV). + """ + if not os.path.exists(csv_path): + pytest.fail(f"Perf CSV not found: {csv_path}") + with open(csv_path, "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row.get("model") != model: + continue + if row.get("status") != status: + pytest.fail( + f"model {model} in perf CSV did not run successfully (status={row.get('status')})." + ) + if performance is not None and str(row.get("performance", "")) != str(performance): + pytest.fail( + f"model {model} expected performance {performance}, got {row.get('performance')}." + ) + return + pytest.fail(f"model {model} not found in perf CSV.") + + +def get_run_live_log_path(log_base_name, suffix=".run.live.log"): + """Return the path to a run live log file for the given base name and vendor. 
+ + Args: + log_base_name: Base name without extension, e.g. 'dummy_dummy' or 'dummy_timeout_dummy'. + suffix: Log file suffix (default '.run.live.log'). + + Returns: + str: Absolute path under BASE_DIR to the log file. + """ + vendor = "amd" if not is_nvidia() else "nvidia" + return os.path.join(BASE_DIR, log_base_name + ".ubuntu." + vendor + suffix) + + +def get_timeout_seconds_from_log(log_path, timeout_regex=None): + """Read the first 'Setting timeout to N seconds' line from a run log and return N. + + Args: + log_path: Path to the run live log file. + timeout_regex: Optional compiled regex; default matches '⏰ Setting timeout to ([0-9]*) seconds.' + + Returns: + str or None: The timeout seconds string, or None if not found. + """ + if timeout_regex is None: + timeout_regex = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") + if not os.path.exists(log_path): + return None + with open(log_path, "r") as f: + for line in f: + match = timeout_regex.search(line) + if match: + return match.group(1) + return None diff --git a/tests/integration/test_batch_manifest_integration.py b/tests/integration/test_batch_manifest_integration.py deleted file mode 100644 index 86841b33..00000000 --- a/tests/integration/test_batch_manifest_integration.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Integration tests for batch manifest build workflow. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
-""" - -import json -import os -import tempfile - -import pytest -from typer.testing import CliRunner - -from madengine.cli import app - - -class TestBatchManifestBuildIntegration: - """Integration tests for batch manifest build functionality.""" - - def test_batch_manifest_mutually_exclusive_with_tags(self): - """Test that --batch-manifest and --tags are mutually exclusive.""" - runner = CliRunner() - - # Create a simple batch manifest - batch_data = [{"model_name": "dummy", "build_new": True}] - - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(batch_data, f) - batch_file = f.name - - try: - # Test that using both options is rejected - result = runner.invoke( - app, - [ - "build", - "--batch-manifest", batch_file, - "--tags", "dummy", - "--additional-context", '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' - ] - ) - - # Should fail with mutual exclusivity error - assert result.exit_code != 0 - assert "Cannot specify both --batch-manifest and --tags" in result.output - finally: - os.unlink(batch_file) - diff --git a/tests/integration/test_cli_error_integration.py b/tests/integration/test_cli_error_integration.py deleted file mode 100644 index b04de1bb..00000000 --- a/tests/integration/test_cli_error_integration.py +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/env python3 -""" -Unit tests for madengine CLI error handling integration. - -Tests the integration of unified error handling in mad_cli.py and -distributed_orchestrator.py components. 
-""" - -import pytest -import json -import os -import tempfile -from unittest.mock import Mock, patch, MagicMock, mock_open -from rich.console import Console - -# Add src to path for imports -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) - -from madengine.core.errors import ( - ErrorHandler, - ConfigurationError, - set_error_handler, - get_error_handler, - create_error_context -) - - -class TestMadCLIErrorIntegration: - """Test mad_cli.py error handling integration.""" - - @patch('madengine.cli.utils.Console') - def test_setup_logging_creates_error_handler(self, mock_console_class): - """Test that setup_logging initializes the unified error handler.""" - from madengine.cli import setup_logging - - mock_console = Mock(spec=Console) - mock_console_class.return_value = mock_console - - # Clear any existing global error handler - set_error_handler(None) - - # Call setup_logging - setup_logging(verbose=True) - - # Verify error handler was set - handler = get_error_handler() - assert handler is not None - assert isinstance(handler, ErrorHandler) - assert handler.verbose is True - - def test_setup_logging_verbose_flag(self): - """Test that verbose flag is properly passed to error handler.""" - from madengine.cli import setup_logging - - # Test with verbose=False - setup_logging(verbose=False) - handler = get_error_handler() - assert handler.verbose is False - - # Test with verbose=True - setup_logging(verbose=True) - handler = get_error_handler() - assert handler.verbose is True - - def test_build_command_error_handling(self): - """Test that build command imports and can use unified error handling.""" - from madengine.cli import ExitCode - - # Test that the import works and error handling is available - try: - # This tests the actual import in mad_cli.py - from madengine.cli import setup_logging - - # Verify error handler can be set up - setup_logging(verbose=False) - - # Verify handle_error can be imported in the context where it's 
used - from madengine.core.errors import handle_error, create_error_context - - # Create a test error to ensure the system works - error = Exception("Test build error") - context = create_error_context( - operation="build", - phase="build", - component="CLI" - ) - - # This should not raise an exception - handle_error(error, context=context) - - except ImportError as e: - pytest.fail(f"Error handling integration failed: {e}") - - @patch('madengine.cli.utils.console') - def test_cli_error_display_consistency(self, mock_console): - """Test that CLI errors are displayed consistently through unified handler.""" - from madengine.cli import setup_logging - - # Setup logging to initialize error handler - setup_logging(verbose=False) - - # Get the initialized error handler - handler = get_error_handler() - - # Create a test error - error = ConfigurationError( - "Invalid configuration", - context=create_error_context( - operation="cli_command", - component="CLI", - phase="validation" - ) - ) - - # Handle the error through the unified system - handler.handle_error(error) - - # The error should be displayed through Rich console - # (Note: The actual console calls depend on the handler implementation) - assert handler.console is not None - - -class TestErrorHandlingWorkflow: - """Test complete error handling workflow across components.""" - - @patch('madengine.cli.utils.console') - def test_end_to_end_error_flow(self, mock_console): - """Test complete error flow from CLI through orchestrator.""" - from madengine.cli import setup_logging - from madengine.core.errors import ValidationError - - # Setup unified error handling - setup_logging(verbose=True) - handler = get_error_handler() - - # Create an error that might occur in the orchestrator - orchestrator_error = ValidationError( - "Invalid model tag format", - context=create_error_context( - operation="model_discovery", - component="DistributedOrchestrator", - phase="validation", - model_name="invalid::tag" - ), - 
suggestions=[ - "Use format: model_name:version", - "Check model name contains only alphanumeric characters" - ] - ) - - # Handle the error through the unified system - handler.handle_error(orchestrator_error) - - # Verify the error was processed - assert handler.console is not None - assert orchestrator_error.context.operation == "model_discovery" - assert orchestrator_error.context.component == "DistributedOrchestrator" - assert len(orchestrator_error.suggestions) == 2 - - def test_error_logging_integration(self): - """Test that errors are properly logged with structured data.""" - from madengine.cli import setup_logging - from madengine.core.errors import BuildError - - # Setup logging - setup_logging(verbose=False) - handler = get_error_handler() - - # Create a build error with rich context - build_error = BuildError( - "Docker build failed", - context=create_error_context( - operation="docker_build", - component="DockerBuilder", - phase="build", - model_name="test_model", - additional_info={"dockerfile": "Dockerfile.ubuntu.amd"} - ), - suggestions=["Check Dockerfile syntax", "Verify base image availability"] - ) - - # Mock the logger to capture log calls - with patch.object(handler, 'logger') as mock_logger: - handler.handle_error(build_error) - - # Verify logging was called with structured data - mock_logger.error.assert_called_once() - log_call_args = mock_logger.error.call_args - - # Check the log message - assert "build: Docker build failed" in log_call_args[0][0] - - # Check the extra structured data - extra_data = log_call_args[1]['extra'] - assert extra_data['context']['operation'] == "docker_build" - assert extra_data['context']['component'] == "DockerBuilder" - assert extra_data['recoverable'] is False # BuildError is not recoverable - assert len(extra_data['suggestions']) == 2 - - def test_error_context_serialization(self): - """Test that error contexts can be serialized for logging and debugging.""" - from madengine.core.errors import RuntimeError - 
- context = create_error_context( - operation="model_execution", - component="ContainerRunner", - phase="runtime", - model_name="llama2", - node_id="worker-node-01", - file_path="/models/llama2/run.sh", - additional_info={ - "container_id": "abc123", - "gpu_count": 2, - "timeout": 3600 - } - ) - - error = RuntimeError( - "Model execution failed with exit code 1", - context=context - ) - - # Test that context can be serialized - context_dict = error.context.__dict__ - json_str = json.dumps(context_dict, default=str) - - # Verify all context information is in the serialized form - assert "model_execution" in json_str - assert "ContainerRunner" in json_str - assert "runtime" in json_str - assert "llama2" in json_str - assert "worker-node-01" in json_str - assert "abc123" in json_str - - -class TestErrorHandlingPerformance: - """Test performance aspects of error handling.""" - - def test_error_handler_initialization_performance(self): - """Test that error handler initialization is fast.""" - import time - from madengine.core.errors import ErrorHandler - from rich.console import Console - - start_time = time.time() - - # Create multiple error handlers - for _ in range(100): - console = Console() - handler = ErrorHandler(console=console, verbose=False) - - end_time = time.time() - - # Should be able to create 100 handlers in under 1 second - assert end_time - start_time < 1.0 - - def test_error_context_creation_performance(self): - """Test that error context creation is efficient.""" - import time - - start_time = time.time() - - # Create many error contexts - for i in range(1000): - context = create_error_context( - operation=f"operation_{i}", - component=f"Component_{i}", - phase="test", - model_name=f"model_{i}", - additional_info={"iteration": i} - ) - - end_time = time.time() - - # Should be able to create 1000 contexts in under 0.1 seconds - assert end_time - start_time < 0.1 - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) \ No newline at end of 
file diff --git a/tests/integration/test_error_system_integration.py b/tests/integration/test_error_system_integration.py deleted file mode 100644 index bf704da2..00000000 --- a/tests/integration/test_error_system_integration.py +++ /dev/null @@ -1,282 +0,0 @@ -#!/usr/bin/env python3 -""" -Integration tests for madengine unified error handling system. - -This test file focuses on testing the integration without requiring -optional dependencies like paramiko, ansible-runner, or kubernetes. -""" - -import pytest -import json -from unittest.mock import Mock, patch, MagicMock - -# Add src to path for imports -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) - -from madengine.core.errors import ( - ErrorHandler, - MADEngineError, - ValidationError, - ConfigurationError, - RunnerError, - set_error_handler, - get_error_handler, - create_error_context -) - - -class TestUnifiedErrorSystem: - """Test the unified error handling system integration.""" - - def test_error_system_basic_functionality(self): - """Test basic error system functionality works.""" - # Create error handler - mock_console = Mock() - handler = ErrorHandler(console=mock_console, verbose=False) - - # Create error with context - context = create_error_context( - operation="test_operation", - component="TestComponent", - model_name="test_model" - ) - - error = ValidationError("Test validation error", context=context) - - # Handle the error - handler.handle_error(error) - - # Verify it was handled - mock_console.print.assert_called_once() - - # Verify error structure - assert error.context.operation == "test_operation" - assert error.context.component == "TestComponent" - assert error.recoverable is True - - def test_mad_cli_error_handler_setup(self): - """Test that mad_cli properly sets up error handling.""" - from madengine.cli import setup_logging - - # Clear existing handler - set_error_handler(None) - - # Setup logging - setup_logging(verbose=True) - - # 
Verify handler was created - handler = get_error_handler() - assert handler is not None - assert isinstance(handler, ErrorHandler) - assert handler.verbose is True - - - def test_runner_error_base_class(self): - """Test that RunnerError base class works properly.""" - context = create_error_context( - operation="runner_test", - component="TestRunner" - ) - - error = RunnerError("Test runner error", context=context) - - assert isinstance(error, MADEngineError) - assert error.recoverable is True - assert error.context.operation == "runner_test" - assert error.context.component == "TestRunner" - - def test_error_context_serialization(self): - """Test that error contexts can be serialized.""" - context = create_error_context( - operation="serialization_test", - component="TestComponent", - model_name="test_model", - node_id="test_node", - additional_info={"key": "value", "number": 42} - ) - - error = ValidationError("Test error", context=context) - - # Test serialization - context_dict = error.context.__dict__ - json_str = json.dumps(context_dict, default=str) - - # Verify content - assert "serialization_test" in json_str - assert "TestComponent" in json_str - assert "test_model" in json_str - assert "test_node" in json_str - assert "key" in json_str - assert "42" in json_str - - def test_error_hierarchy_consistency(self): - """Test that all error types maintain consistent behavior.""" - from madengine.core.errors import ( - ValidationError, ConnectionError, AuthenticationError, - RuntimeError, BuildError, DiscoveryError, OrchestrationError, - RunnerError, ConfigurationError, TimeoutError - ) - - error_classes = [ - ValidationError, ConnectionError, AuthenticationError, - RuntimeError, BuildError, DiscoveryError, OrchestrationError, - RunnerError, ConfigurationError, TimeoutError - ] - - for error_class in error_classes: - error = error_class("Test error message") - - # All should inherit from MADEngineError - assert isinstance(error, MADEngineError) - - # All should 
have context (even if default) - assert error.context is not None - - # All should have category - assert error.category is not None - - # All should have recoverable flag - assert isinstance(error.recoverable, bool) - - def test_global_error_handler_workflow(self): - """Test the complete global error handler workflow.""" - from madengine.core.errors import handle_error - - # Create and set global handler - mock_console = Mock() - handler = ErrorHandler(console=mock_console, verbose=False) - set_error_handler(handler) - - # Create error - error = ValidationError( - "Global handler test", - context=create_error_context( - operation="global_test", - component="TestGlobalHandler" - ) - ) - - # Use global handle_error function - handle_error(error) - - # Verify it was handled through the global handler - mock_console.print.assert_called_once() - - def test_error_suggestions_and_recovery(self): - """Test error suggestions and recovery information.""" - suggestions = [ - "Check your configuration file", - "Verify network connectivity", - "Try running with --verbose flag" - ] - - error = ConfigurationError( - "Configuration validation failed", - context=create_error_context( - operation="config_validation", - file_path="/path/to/config.json" - ), - suggestions=suggestions - ) - - assert error.suggestions == suggestions - assert error.recoverable is True - assert error.context.file_path == "/path/to/config.json" - - # Test error display includes suggestions - mock_console = Mock() - handler = ErrorHandler(console=mock_console) - handler.handle_error(error) - - # Should have been called to display the error - mock_console.print.assert_called_once() - - def test_nested_error_handling(self): - """Test handling of nested errors with causes.""" - from madengine.core.errors import RuntimeError as MADRuntimeError, OrchestrationError - - # Create a chain of errors - original_error = ConnectionError("Network timeout") - runtime_error = MADRuntimeError("Operation failed", 
cause=original_error) - final_error = OrchestrationError("Orchestration failed", cause=runtime_error) - - # Test the chain - assert final_error.cause == runtime_error - assert runtime_error.cause == original_error - - # Test handling preserves the chain - mock_console = Mock() - handler = ErrorHandler(console=mock_console, verbose=True) - handler.handle_error(final_error, show_traceback=True) - - # Should display error and potentially traceback - assert mock_console.print.call_count >= 1 - - def test_error_performance(self): - """Test that error handling is performant.""" - import time - - mock_console = Mock() - handler = ErrorHandler(console=mock_console) - - start_time = time.time() - - # Create and handle many errors - for i in range(100): - error = ValidationError( - f"Test error {i}", - context=create_error_context( - operation=f"test_op_{i}", - component="PerformanceTest" - ) - ) - handler.handle_error(error) - - end_time = time.time() - - # Should handle 100 errors in under 1 second - assert end_time - start_time < 1.0 - - # Verify all errors were handled - assert mock_console.print.call_count == 100 - - -class TestErrorSystemBackwardCompatibility: - """Test backward compatibility of the error system.""" - - def test_legacy_exception_handling_still_works(self): - """Test that legacy exception patterns still work.""" - try: - # Simulate old-style exception raising - raise ValueError("Legacy error") - except Exception as e: - # Should be able to handle with new system - mock_console = Mock() - handler = ErrorHandler(console=mock_console) - - context = create_error_context( - operation="legacy_handling", - component="LegacyTest" - ) - - handler.handle_error(e, context=context) - - # Should handle gracefully - mock_console.print.assert_called_once() - - def test_error_system_without_rich(self): - """Test error system fallback when Rich is not available.""" - # This test verifies the system degrades gracefully - # In practice, Rich is a hard dependency, but we 
test the concept - - with patch('madengine.core.errors.Console', side_effect=ImportError): - # Should still be able to create basic errors - error = ValidationError("Test without Rich") - assert str(error) == "Test without Rich" - assert error.recoverable is True - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/integration/test_errors.py b/tests/integration/test_errors.py new file mode 100644 index 00000000..e325e6ac --- /dev/null +++ b/tests/integration/test_errors.py @@ -0,0 +1,327 @@ +"""Integration tests for error handling: CLI integration, workflow, unified system, backward compat. + +Merged from test_cli_error_integration and test_error_system_integration. +Deduplicated: single setup_logging/handler test, one context serialization test. +""" + +import json +import os +import sys +from unittest.mock import Mock, patch, MagicMock + +import pytest + +# Ensure src on path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + +from madengine.core.errors import ( + ErrorHandler, + MADEngineError, + ValidationError, + ConfigurationError, + RunnerError, + set_error_handler, + get_error_handler, + create_error_context, +) + + +# ---- CLI error integration ---- + +class TestCLIErrorIntegration: + """CLI error handling setup and display.""" + + @patch("madengine.cli.utils.Console") + def test_setup_logging_creates_error_handler(self, mock_console_class): + """setup_logging initializes the unified error handler; verbose flag is respected.""" + from madengine.cli import setup_logging + from rich.console import Console + + mock_console = Mock(spec=Console) + mock_console_class.return_value = mock_console + set_error_handler(None) + setup_logging(verbose=True) + handler = get_error_handler() + assert handler is not None and isinstance(handler, ErrorHandler) + assert handler.verbose is True + setup_logging(verbose=False) + assert get_error_handler().verbose is False + + def 
test_build_command_error_handling(self): + """Build command can use unified error handling (imports and handle_error).""" + from madengine.cli import setup_logging + from madengine.core.errors import handle_error + + setup_logging(verbose=False) + error = Exception("Test build error") + context = create_error_context(operation="build", phase="build", component="CLI") + handle_error(error, context=context) + + @patch("madengine.cli.utils.console") + def test_cli_error_display_consistency(self, mock_console): + """CLI errors go through unified handler and handler has console.""" + from madengine.cli import setup_logging + + setup_logging(verbose=False) + handler = get_error_handler() + error = ConfigurationError( + "Invalid configuration", + context=create_error_context(operation="cli_command", component="CLI", phase="validation"), + ) + handler.handle_error(error) + assert handler.console is not None + + +# ---- Error workflow ---- + +class TestErrorWorkflow: + """End-to-end error flow and logging.""" + + @patch("madengine.cli.utils.console") + def test_end_to_end_error_flow(self, mock_console): + """Error flow from CLI through orchestrator-style error.""" + from madengine.cli import setup_logging + + setup_logging(verbose=True) + handler = get_error_handler() + err = ValidationError( + "Invalid model tag format", + context=create_error_context( + operation="model_discovery", + component="DistributedOrchestrator", + phase="validation", + model_name="invalid::tag", + ), + suggestions=["Use format: model_name:version", "Check model name"], + ) + handler.handle_error(err) + assert err.context.operation == "model_discovery" + assert len(err.suggestions) == 2 + + def test_error_logging_integration(self): + """Errors are logged with structured data.""" + from madengine.cli import setup_logging + from madengine.core.errors import BuildError + + setup_logging(verbose=False) + handler = get_error_handler() + build_error = BuildError( + "Docker build failed", + 
context=create_error_context( + operation="docker_build", + component="DockerBuilder", + phase="build", + model_name="test_model", + additional_info={"dockerfile": "Dockerfile.ubuntu.amd"}, + ), + suggestions=["Check Dockerfile syntax", "Verify base image availability"], + ) + with patch.object(handler, "logger") as mock_logger: + handler.handle_error(build_error) + mock_logger.error.assert_called_once() + extra = mock_logger.error.call_args[1]["extra"] + assert extra["context"]["operation"] == "docker_build" + assert extra["recoverable"] is False + assert len(extra["suggestions"]) == 2 + + def test_error_context_serialization(self): + """Error context can be serialized for logging.""" + from madengine.core.errors import RuntimeError + + context = create_error_context( + operation="model_execution", + component="ContainerRunner", + phase="runtime", + model_name="llama2", + node_id="worker-node-01", + additional_info={"container_id": "abc123", "gpu_count": 2}, + ) + error = RuntimeError("Model execution failed", context=context) + data = json.dumps(error.context.__dict__, default=str) + assert "model_execution" in data and "ContainerRunner" in data and "abc123" in data + + +# ---- Unified error system ---- + +class TestUnifiedErrorSystem: + """Unified error handling system.""" + + def test_error_system_basic_functionality(self): + """Handler handles ValidationError and context is preserved.""" + mock_console = Mock() + handler = ErrorHandler(console=mock_console, verbose=False) + context = create_error_context( + operation="test_operation", component="TestComponent", model_name="test_model" + ) + error = ValidationError("Test validation error", context=context) + handler.handle_error(error) + mock_console.print.assert_called_once() + assert error.context.operation == "test_operation" + assert error.recoverable is True + + def test_runner_error_base_class(self): + """RunnerError is MADEngineError with recoverable and context.""" + context = 
create_error_context(operation="runner_test", component="TestRunner") + error = RunnerError("Test runner error", context=context) + assert isinstance(error, MADEngineError) + assert error.recoverable is True + assert error.context.operation == "runner_test" + + def test_error_context_serialization_unified(self): + """Context with many fields serializes for logging.""" + context = create_error_context( + operation="serialization_test", + component="TestComponent", + model_name="test_model", + node_id="test_node", + additional_info={"key": "value", "number": 42}, + ) + error = ValidationError("Test error", context=context) + data = json.dumps(error.context.__dict__, default=str) + assert "serialization_test" in data and "test_node" in data and "42" in data + + def test_error_hierarchy_consistency(self): + """All error types inherit MADEngineError and have context/category/recoverable.""" + from madengine.core.errors import ( + ValidationError, + ConnectionError, + AuthenticationError, + RuntimeError, + BuildError, + DiscoveryError, + OrchestrationError, + RunnerError, + ConfigurationError, + TimeoutError, + ) + + for error_class in [ + ValidationError, + ConnectionError, + AuthenticationError, + RuntimeError, + BuildError, + DiscoveryError, + OrchestrationError, + RunnerError, + ConfigurationError, + TimeoutError, + ]: + err = error_class("Test error message") + assert isinstance(err, MADEngineError) + assert err.context is not None + assert err.category is not None + assert isinstance(err.recoverable, bool) + + def test_global_error_handler_workflow(self): + """handle_error uses global handler when set.""" + from madengine.core.errors import handle_error + + mock_console = Mock() + handler = ErrorHandler(console=mock_console, verbose=False) + set_error_handler(handler) + error = ValidationError( + "Global handler test", + context=create_error_context(operation="global_test", component="TestGlobalHandler"), + ) + handle_error(error) + 
mock_console.print.assert_called_once() + + def test_error_suggestions_and_recovery(self): + """Suggestions and context are stored and displayed.""" + suggestions = ["Check config", "Verify network", "Try --verbose"] + error = ConfigurationError( + "Configuration validation failed", + context=create_error_context( + operation="config_validation", file_path="/path/to/config.json" + ), + suggestions=suggestions, + ) + assert error.suggestions == suggestions + assert error.context.file_path == "/path/to/config.json" + mock_console = Mock() + ErrorHandler(console=mock_console).handle_error(error) + mock_console.print.assert_called_once() + + def test_nested_error_handling(self): + """Nested errors with cause chain are handled.""" + from madengine.core.errors import RuntimeError as MADRuntimeError, OrchestrationError + + orig = ConnectionError("Network timeout") + runtime = MADRuntimeError("Operation failed", cause=orig) + final = OrchestrationError("Orchestration failed", cause=runtime) + assert final.cause == runtime and runtime.cause == orig + mock_console = Mock() + ErrorHandler(console=mock_console, verbose=True).handle_error(final, show_traceback=True) + assert mock_console.print.call_count >= 1 + + def test_error_performance(self): + """Handle 100 errors in under 1 second.""" + import time + + mock_console = Mock() + handler = ErrorHandler(console=mock_console) + start = time.time() + for i in range(100): + err = ValidationError( + f"Test error {i}", + context=create_error_context(operation=f"test_op_{i}", component="PerformanceTest"), + ) + handler.handle_error(err) + assert time.time() - start < 1.0 + assert mock_console.print.call_count == 100 + + +# ---- Performance (lightweight) ---- + +class TestErrorHandlingPerformance: + """Error handler and context creation performance.""" + + def test_error_handler_initialization_performance(self): + """Create 100 handlers in under 1 second.""" + import time + from rich.console import Console + + start = time.time() + 
for _ in range(100): + ErrorHandler(console=Console(), verbose=False) + assert time.time() - start < 1.0 + + def test_error_context_creation_performance(self): + """Create 1000 contexts in under 0.1 seconds.""" + import time + + start = time.time() + for i in range(1000): + create_error_context( + operation=f"op_{i}", component=f"C_{i}", phase="test", model_name=f"m_{i}" + ) + assert time.time() - start < 0.1 + + +# ---- Backward compatibility ---- + +class TestErrorSystemBackwardCompatibility: + """Backward compatibility of the error system.""" + + def test_legacy_exception_handling_still_works(self): + """Legacy ValueError can be handled via handle_error with context.""" + try: + raise ValueError("Legacy error") + except Exception as e: + mock_console = Mock() + handler = ErrorHandler(console=mock_console) + context = create_error_context(operation="legacy_handling", component="LegacyTest") + handler.handle_error(e, context=context) + mock_console.print.assert_called_once() + + def test_error_system_without_rich(self): + """Errors can be created and have str/recoverable when Rich is unavailable.""" + with patch("madengine.core.errors.Console", side_effect=ImportError): + error = ValidationError("Test without Rich") + assert "Test without Rich" in str(error) + assert error.recoverable is True + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/integration/test_multi_gpu_arch.py b/tests/integration/test_multi_gpu_arch.py deleted file mode 100644 index 339d3fb9..00000000 --- a/tests/integration/test_multi_gpu_arch.py +++ /dev/null @@ -1,194 +0,0 @@ -"""Comprehensive unit tests for multi-GPU architecture support in madengine. 
- -Covers: -- Multi-arch DockerBuilder logic (image naming, manifest, legacy/override) -- Dockerfile GPU variable parsing/validation -- Target architecture normalization and compatibility -- Run-phase manifest filtering by gpu_architecture - -UPDATED: Now uses BuildOrchestrator instead of deprecated DistributedOrchestrator. - -All tests are logic/unit tests and do not require GPU hardware. -""" -import pytest -from unittest.mock import MagicMock, patch -from madengine.execution.docker_builder import DockerBuilder -from madengine.orchestration.build_orchestrator import BuildOrchestrator -from madengine.orchestration.run_orchestrator import RunOrchestrator - -class TestMultiGPUArch: - def setup_method(self): - self.context = MagicMock() - self.console = MagicMock() - self.builder = DockerBuilder(self.context, self.console) - - # Mock args for BuildOrchestrator (replacement for DistributedOrchestrator) - mock_args = MagicMock() - mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' - mock_args.additional_context_file = None - mock_args.live_output = True - mock_args.data_config_file_name = "data.json" - mock_args.tags = [] - mock_args.target_archs = [] - mock_args.force_mirror_local = None - - # Create BuildOrchestrator with mocked args - self.orchestrator = BuildOrchestrator(mock_args) - - # --- DockerBuilder Multi-Arch Logic --- - @patch.object(DockerBuilder, "_get_dockerfiles_for_model") - @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") - @patch.object(DockerBuilder, "build_image") - def test_multi_arch_build_image_naming(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): - model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} - mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] - # GPU variable present - mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", 
"build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) - assert result[0]["docker_image"].endswith("_gfx908") - # GPU variable absent - mock_check_gpu_vars.return_value = (False, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) - assert not result[0]["docker_image"].endswith("_gfx908") - - @patch.object(DockerBuilder, "_get_dockerfiles_for_model") - @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") - @patch.object(DockerBuilder, "build_image") - def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): - model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} - mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] - mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} - result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) - assert result[0]["gpu_architecture"] == "gfx908" - - @patch.object(DockerBuilder, "_get_dockerfiles_for_model") - @patch.object(DockerBuilder, "build_image") - def test_legacy_single_arch_build(self, mock_build_image, mock_get_dockerfiles): - model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} - mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] - mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} - result = self.builder._build_model_single_arch(model_info, None, False, None, "", None) - assert result[0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" - - @patch.object(DockerBuilder, "_build_model_single_arch") - def test_additional_context_overrides_target_archs(self, 
mock_single_arch): - self.context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"}} - model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} - mock_single_arch.return_value = [{"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0}] - result = self.builder.build_all_models([model_info], target_archs=["gfx908", "gfx90a"]) - assert result["successful_builds"][0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" - - # --- Dockerfile GPU Variable Parsing/Validation --- - def test_parse_dockerfile_gpu_variables(self): - dockerfile_content = """ - ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx908 - ENV PYTORCH_ROCM_ARCH=gfx908;gfx90a - ARG GPU_TARGETS=gfx908,gfx942 - ENV GFX_COMPILATION_ARCH=gfx908 - ARG GPU_ARCHS=gfx908;gfx90a;gfx942 - """ - result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) - assert result["MAD_SYSTEM_GPU_ARCHITECTURE"] == ["gfx908"] - assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] - assert result["GPU_TARGETS"] == ["gfx908", "gfx942"] - assert result["GFX_COMPILATION_ARCH"] == ["gfx908"] - assert result["GPU_ARCHS"] == ["gfx908", "gfx90a", "gfx942"] - - def test_parse_dockerfile_gpu_variables_env_delimiter(self): - dockerfile_content = "ENV PYTORCH_ROCM_ARCH = gfx908,gfx90a" - result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) - assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] - - def test_parse_malformed_dockerfile(self): - dockerfile_content = "ENV BAD_LINE\nARG MAD_SYSTEM_GPU_ARCHITECTURE=\nENV PYTORCH_ROCM_ARCH=\n" - result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) - assert isinstance(result, dict) - - # --- Target Architecture Normalization/Compatibility --- - def test_normalize_architecture_name(self): - cases = { - "gfx908": "gfx908", - "GFX908": "gfx908", - "mi100": "gfx908", - "mi-100": "gfx908", - "mi200": "gfx90a", - "mi-200": "gfx90a", - "mi210": "gfx90a", - "mi250": "gfx90a", - "mi300": "gfx940", - "mi-300": 
"gfx940", - "mi300a": "gfx940", - "mi300x": "gfx942", - "mi-300x": "gfx942", - "unknown": "unknown", - "": None, - } - for inp, expected in cases.items(): - assert self.builder._normalize_architecture_name(inp) == expected - - def test_is_target_arch_compatible_with_variable(self): - assert self.builder._is_target_arch_compatible_with_variable("MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942") - assert self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942") - assert not self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942") - assert self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx908") - assert not self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx942") - assert self.builder._is_target_arch_compatible_with_variable("UNKNOWN_VAR", ["foo"], "bar") - - def test_is_compilation_arch_compatible(self): - assert self.builder._is_compilation_arch_compatible("gfx908", "gfx908") - assert not self.builder._is_compilation_arch_compatible("gfx908", "gfx942") - assert self.builder._is_compilation_arch_compatible("foo", "foo") - - # --- Run-Phase Manifest Filtering --- - def test_filter_images_by_gpu_architecture(self): - """Test image filtering by GPU architecture using RunOrchestrator. - - Note: Current behavior treats images without gpu_vendor as compatible (legacy support). 
- """ - # Create RunOrchestrator which has _filter_images_by_gpu_architecture - mock_args = MagicMock() - mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' - mock_args.additional_context_file = None - mock_args.tags = [] - mock_args.live_output = True - mock_args.data_config_file_name = "data.json" - mock_args.force_mirror_local = None - - run_orch = RunOrchestrator(mock_args) - - # Test exact match - both images have gpu_vendor set to "AMD" - built_images = { - "img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, - "img2": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"} - } - filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") - assert "img1" in filtered and "img2" not in filtered - - # Test legacy image (no gpu_vendor field) - should be included for compatibility - built_images = { - "img1": {"gpu_architecture": "gfx908"}, # No gpu_vendor - "img2": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"} - } - filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") - # Current behavior: legacy images (no gpu_vendor) are treated as compatible - assert "img1" in filtered # Legacy image included - # img2 may or may not be included depending on gpu_vendor matching logic - - # Test no match case with explicit gpu_vendor - built_images = { - "img1": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}, - "img2": {"gpu_architecture": "gfx942", "gpu_vendor": "AMD"} - } - filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") - assert len(filtered) == 0 - - # Test all matching case with gpu_vendor - built_images = { - "img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, - "img2": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"} - } - filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") - assert len(filtered) == 2 diff --git a/tests/integration/test_orchestrator_workflows.py b/tests/integration/test_orchestrator_workflows.py index 
78d9636b..12aa1a91 100644 --- a/tests/integration/test_orchestrator_workflows.py +++ b/tests/integration/test_orchestrator_workflows.py @@ -1,7 +1,7 @@ """Test the orchestration layer modules. This module tests the Build and Run orchestrators that coordinate -the build and execution workflows. +the build and execution workflows, and batch manifest CLI behavior. Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ @@ -14,13 +14,49 @@ # third-party modules import pytest +from typer.testing import CliRunner # project modules +from madengine.cli import app from madengine.orchestration.build_orchestrator import BuildOrchestrator from madengine.orchestration.run_orchestrator import RunOrchestrator from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError +# ============================================================================ +# Batch manifest (CLI build options) +# ============================================================================ + +class TestBatchManifestBuildIntegration: + """Batch manifest and --tags are mutually exclusive.""" + + def test_batch_manifest_mutually_exclusive_with_tags(self): + """--batch-manifest and --tags cannot be used together.""" + runner = CliRunner() + batch_data = [{"model_name": "dummy", "build_new": True}] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + batch_file = f.name + try: + result = runner.invoke( + app, + [ + "build", + "--batch-manifest", batch_file, + "--tags", "dummy", + "--additional-context", '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}', + ], + ) + assert result.exit_code != 0 + assert "Cannot specify both --batch-manifest and --tags" in result.output + finally: + os.unlink(batch_file) + + +# ============================================================================ +# Build orchestrator +# ============================================================================ + class TestBuildOrchestrator: """Test the 
Build Orchestrator module.""" diff --git a/tests/integration/test_platform_integration.py b/tests/integration/test_platform_integration.py index be0fe770..4f24d0c7 100644 --- a/tests/integration/test_platform_integration.py +++ b/tests/integration/test_platform_integration.py @@ -14,6 +14,13 @@ from unittest.mock import MagicMock, patch, mock_open import pytest +from madengine.execution.docker_builder import DockerBuilder +from madengine.execution.dockerfile_utils import ( + parse_dockerfile_gpu_variables, + normalize_architecture_name, + is_target_arch_compatible_with_variable, + is_compilation_arch_compatible, +) from madengine.orchestration.build_orchestrator import BuildOrchestrator from madengine.orchestration.run_orchestrator import RunOrchestrator from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError @@ -543,6 +550,139 @@ def test_cpu_only_execution(self, cpu_context, mock_run_args, temp_manifest_file assert orchestrator.context.get_system_ngpus() == 0 +# ============================================================================ +# Multi-GPU architecture (Dockerfile parsing, normalization, image filtering) +# ============================================================================ + +class TestMultiGPUArch: + """Multi-arch DockerBuilder logic, dockerfile_utils, and run-phase image filtering.""" + + def setup_method(self): + self.context = MagicMock() + self.console = MagicMock() + self.builder = DockerBuilder(self.context, self.console) + mock_args = MagicMock() + mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + mock_args.additional_context_file = None + mock_args.live_output = True + mock_args.data_config_file_name = "data.json" + mock_args.tags = [] + mock_args.target_archs = [] + mock_args.force_mirror_local = None + self.orchestrator = BuildOrchestrator(mock_args) + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, 
"_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_build_image_naming(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["docker_image"].endswith("_gfx908") + mock_check_gpu_vars.return_value = (False, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert not result[0]["docker_image"].endswith("_gfx908") + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["gpu_architecture"] == "gfx908" + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "build_image") + def test_legacy_single_arch_build(self, mock_build_image, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": 
"docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_single_arch(model_info, None, False, None, "", None) + assert result[0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + @patch.object(DockerBuilder, "_build_model_single_arch") + def test_additional_context_overrides_target_archs(self, mock_single_arch): + self.context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"}} + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_single_arch.return_value = [{"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0}] + result = self.builder.build_all_models([model_info], target_archs=["gfx908", "gfx90a"]) + assert result["successful_builds"][0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + def test_parse_dockerfile_gpu_variables(self): + content = """ + ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx908 + ENV PYTORCH_ROCM_ARCH=gfx908;gfx90a + ARG GPU_TARGETS=gfx908,gfx942 + ENV GFX_COMPILATION_ARCH=gfx908 + ARG GPU_ARCHS=gfx908;gfx90a;gfx942 + """ + result = parse_dockerfile_gpu_variables(content) + assert result["MAD_SYSTEM_GPU_ARCHITECTURE"] == ["gfx908"] + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + assert result["GPU_TARGETS"] == ["gfx908", "gfx942"] + assert result["GFX_COMPILATION_ARCH"] == ["gfx908"] + assert result["GPU_ARCHS"] == ["gfx908", "gfx90a", "gfx942"] + + def test_parse_dockerfile_gpu_variables_env_delimiter(self): + result = parse_dockerfile_gpu_variables("ENV PYTORCH_ROCM_ARCH = gfx908,gfx90a") + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + + def test_parse_malformed_dockerfile(self): + result = parse_dockerfile_gpu_variables( + "ENV BAD_LINE\nARG MAD_SYSTEM_GPU_ARCHITECTURE=\nENV PYTORCH_ROCM_ARCH=\n" + ) + assert isinstance(result, dict) + + def test_normalize_architecture_name(self): + 
cases = { + "gfx908": "gfx908", "GFX908": "gfx908", "mi100": "gfx908", "mi-100": "gfx908", + "mi200": "gfx90a", "mi-200": "gfx90a", "mi210": "gfx90a", "mi250": "gfx90a", + "mi300": "gfx940", "mi-300": "gfx940", "mi300a": "gfx940", + "mi300x": "gfx942", "mi-300x": "gfx942", "unknown": "unknown", "": None, + } + for inp, expected in cases.items(): + assert normalize_architecture_name(inp) == expected + + def test_is_target_arch_compatible_with_variable(self): + assert is_target_arch_compatible_with_variable("MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942") + assert is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942") + assert not is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942") + assert is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx908") + assert not is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx942") + assert is_target_arch_compatible_with_variable("UNKNOWN_VAR", ["foo"], "bar") + + def test_is_compilation_arch_compatible(self): + assert is_compilation_arch_compatible("gfx908", "gfx908") + assert not is_compilation_arch_compatible("gfx908", "gfx942") + assert is_compilation_arch_compatible("foo", "foo") + + def test_filter_images_by_gpu_architecture(self): + mock_args = MagicMock() + mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + mock_args.additional_context_file = None + mock_args.tags = [] + mock_args.live_output = True + mock_args.data_config_file_name = "data.json" + mock_args.force_mirror_local = None + run_orch = RunOrchestrator(mock_args) + built = {"img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, "img2": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}} + filtered = run_orch._filter_images_by_gpu_architecture(built, "gfx908") + assert "img1" in filtered and "img2" not in filtered + built_legacy = {"img1": {"gpu_architecture": "gfx908"}, "img2": {"gpu_architecture": 
"gfx90a", "gpu_vendor": "AMD"}} + filtered = run_orch._filter_images_by_gpu_architecture(built_legacy, "gfx908") + assert "img1" in filtered + built_nomatch = {"img1": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}, "img2": {"gpu_architecture": "gfx942", "gpu_vendor": "AMD"}} + assert len(run_orch._filter_images_by_gpu_architecture(built_nomatch, "gfx908")) == 0 + built_all = {"img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, "img2": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}} + assert len(run_orch._filter_images_by_gpu_architecture(built_all, "gfx908")) == 2 + + if __name__ == "__main__": pytest.main([__file__, "-v", "--tb=short"]) diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py deleted file mode 100644 index 458c4e4a..00000000 --- a/tests/test_cleanup.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Test cleanup functionality for robust directory removal.""" - -import unittest -from unittest.mock import Mock, patch, call, MagicMock -import time -from madengine.tools.run_models import RunModels - - -class TestCleanupModelDirectory(unittest.TestCase): - """Test cases for the _cleanup_model_directory method.""" - - def setUp(self): - """Set up test fixtures.""" - # Create a mock args object with all required attributes - self.mock_args = Mock() - self.mock_args.keep_alive = False - self.mock_args.keep_model_dir = False - self.mock_args.generate_sys_env_details = False - self.mock_args.data_config_file_name = "/tmp/nonexistent_data.json" # Use non-existent path - self.mock_args.additional_context = "" - self.mock_args.additional_context_file = None - self.mock_args.force_mirror_local = False - - # Patch the dependencies before creating RunModels instance - with patch('madengine.tools.run_models.Console'), \ - patch('madengine.tools.run_models.Context') as mock_context_cls: - # Setup Context mock - mock_context = MagicMock() - mock_context.ctx = {} - mock_context_cls.return_value = mock_context - - self.run_models = RunModels(self.mock_args) 
- - # Create mock docker instance - self.mock_docker = Mock() - - def test_cleanup_success_first_attempt(self): - """Test successful cleanup on first attempt.""" - model_dir = "test_model_dir" - - # Mock successful removal - self.mock_docker.sh.return_value = "" - - # Call cleanup method - self.run_models._cleanup_model_directory(self.mock_docker, model_dir) - - # Verify rm command was called - self.mock_docker.sh.assert_called_with(f"rm -rf {model_dir}", timeout=240) - # Should only be called once on success - self.assertEqual(self.mock_docker.sh.call_count, 1) - - def test_cleanup_success_after_retries(self): - """Test successful cleanup after retries.""" - model_dir = "test_model_dir" - - # Mock failure on first 2 attempts, success on 3rd - self.mock_docker.sh.side_effect = [ - RuntimeError("Directory not empty"), # First rm -rf fails - RuntimeError("Directory not empty"), # fuser command - RuntimeError("Directory not empty"), # chmod command - RuntimeError("Directory not empty"), # Second rm -rf fails - RuntimeError("Directory not empty"), # fuser command - RuntimeError("Directory not empty"), # chmod command - "", # Third rm -rf succeeds - ] - - # Call cleanup method with shorter retry delay for testing - with patch('time.sleep'): # Mock sleep to speed up test - self.run_models._cleanup_model_directory( - self.mock_docker, model_dir, max_retries=3, retry_delay=0.1 - ) - - # Verify multiple attempts were made - self.assertGreater(self.mock_docker.sh.call_count, 1) - - def test_cleanup_all_attempts_fail_no_exception(self): - """Test that cleanup failure doesn't raise exception (only logs warning).""" - model_dir = "test_model_dir" - - # Mock all attempts failing - self.mock_docker.sh.side_effect = RuntimeError("Directory not empty") - - # Call cleanup method - should NOT raise exception - with patch('time.sleep'): # Mock sleep to speed up test - try: - self.run_models._cleanup_model_directory( - self.mock_docker, model_dir, max_retries=2, retry_delay=0.1 - ) - # 
Should complete without raising exception - cleanup_succeeded = True - except Exception as e: - cleanup_succeeded = False - self.fail(f"Cleanup should not raise exception, but raised: {e}") - - self.assertTrue(cleanup_succeeded, "Cleanup should complete even if all attempts fail") - - def test_cleanup_uses_fuser_and_chmod_on_retry(self): - """Test that retry attempts use fuser and chmod.""" - model_dir = "test_model_dir" - - # Track the commands called - commands_called = [] - - def track_commands(cmd, timeout): - commands_called.append(cmd) - if "rm -rf" in cmd and len([c for c in commands_called if "rm -rf" in c]) == 1: - # Fail first rm -rf - raise RuntimeError("Directory not empty") - return "" - - self.mock_docker.sh.side_effect = track_commands - - # Call cleanup method - with patch('time.sleep'): # Mock sleep to speed up test - self.run_models._cleanup_model_directory( - self.mock_docker, model_dir, max_retries=2, retry_delay=0.1 - ) - - # Verify fuser and chmod were called on retry - command_strings = ' '.join(commands_called) - self.assertIn('fuser', command_strings, "fuser should be called on retry") - self.assertIn('chmod', command_strings, "chmod should be called on retry") - - def test_cleanup_with_custom_retry_params(self): - """Test cleanup with custom retry parameters.""" - model_dir = "test_model_dir" - custom_retries = 5 - custom_delay = 0.5 - - self.mock_docker.sh.return_value = "" - - # Call with custom parameters - self.run_models._cleanup_model_directory( - self.mock_docker, model_dir, - max_retries=custom_retries, - retry_delay=custom_delay - ) - - # Verify it worked - self.mock_docker.sh.assert_called() - - -class TestCleanupIntegration(unittest.TestCase): - """Integration tests for cleanup in run_model_impl.""" - - def setUp(self): - """Set up test fixtures.""" - self.mock_args = Mock() - self.mock_args.keep_alive = False - self.mock_args.keep_model_dir = False - self.mock_args.generate_sys_env_details = False - 
self.mock_args.skip_model_run = True - self.mock_args.data_config_file_name = "/tmp/nonexistent_data.json" - self.mock_args.additional_context = "" - self.mock_args.additional_context_file = None - self.mock_args.force_mirror_local = False - - with patch('madengine.tools.run_models.Console'), \ - patch('madengine.tools.run_models.Context') as mock_context_cls: - mock_context = MagicMock() - mock_context.ctx = {} - mock_context_cls.return_value = mock_context - self.run_models = RunModels(self.mock_args) - - @patch('madengine.tools.run_models.RunModels._cleanup_model_directory') - def test_cleanup_called_when_not_keep_alive(self, mock_cleanup): - """Test that cleanup is called when keep_alive is False.""" - # This test verifies that our new method is called instead of direct rm -rf - # We can't easily test the full run_model_impl, but we've verified the code change - self.assertTrue(hasattr(self.run_models, '_cleanup_model_directory')) - - # Verify the method exists and is callable - self.assertTrue(callable(self.run_models._cleanup_model_directory)) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/test_config_loader.py b/tests/unit/test_config_loader.py index 7e68833c..65184e68 100644 --- a/tests/unit/test_config_loader.py +++ b/tests/unit/test_config_loader.py @@ -19,7 +19,7 @@ from jinja2 import Template -from madengine.deployment.config_loader import ConfigLoader +from madengine.deployment.config_loader import ConfigLoader, apply_deployment_config from madengine.deployment.slurm import SlurmDeployment @@ -156,6 +156,30 @@ def test_override_behavior(self): assert "OMP_NUM_THREADS" in result["env_vars"] # Default still there +class TestApplyDeploymentConfig: + """Test apply_deployment_config helper.""" + + def test_apply_slurm_config_mutates_and_returns(self): + """apply_deployment_config mutates config.additional_context and returns full config.""" + class FakeConfig: + additional_context = {"slurm": {"nodes": 2}} + config = FakeConfig() 
+ result = apply_deployment_config(config, ConfigLoader.load_slurm_config) + assert result is config.additional_context + assert "slurm" in result + assert result["slurm"]["nodes"] == 2 + + def test_apply_k8s_config_mutates_and_returns(self): + """apply_deployment_config with load_k8s_config mutates and returns full config.""" + class FakeConfig: + additional_context = {"k8s": {"gpu_count": 1}} + config = FakeConfig() + result = apply_deployment_config(config, ConfigLoader.load_k8s_config) + assert result is config.additional_context + assert "k8s" in result + assert result["k8s"]["gpu_count"] == 1 + + class TestConfigLoaderK8sConfigs: """Test with actual K8s config files (if they exist).""" diff --git a/tests/unit/test_constants.py b/tests/unit/test_constants.py new file mode 100644 index 00000000..a7230369 --- /dev/null +++ b/tests/unit/test_constants.py @@ -0,0 +1,105 @@ +"""Unit tests for core constants and _get_env_or_creds_or_default.""" + +import json +import os +from unittest.mock import patch + +import pytest + +from madengine.core.constants import ( + NAS_NODES, + MAD_AWS_S3, + MAD_MINIO, + PUBLIC_GITHUB_ROCM_KEY, + _get_env_or_creds_or_default, + _DEFAULT_NAS_NODES, + _DEFAULT_MAD_AWS_S3, + _DEFAULT_MAD_MINIO, + _DEFAULT_PUBLIC_GITHUB_ROCM_KEY, +) + + +class TestGetEnvOrCredsOrDefault: + """Test _get_env_or_creds_or_default helper.""" + + def test_env_override_returns_parsed_json(self): + """When env is set with valid JSON, that value is returned.""" + with patch.dict(os.environ, {"TEST_KEY": '[{"a": 1}]'}, clear=False): + # Need to pass creds - we patch CREDS via the module + import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {}): + result = _get_env_or_creds_or_default( + "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES + ) + assert result == [{"a": 1}] + + def test_env_invalid_json_returns_default(self): + """When env is set with invalid JSON, default is returned.""" + with patch.dict(os.environ, {"TEST_KEY": 
"not json"}, clear=False): + import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {}): + result = _get_env_or_creds_or_default( + "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES + ) + assert result == _DEFAULT_NAS_NODES + + def test_creds_fallback_when_env_unset(self): + """When env is unset and creds has key, creds value is returned.""" + with patch.dict(os.environ, {}, clear=False): + try: + orig = os.environ.get("TEST_KEY") + if "TEST_KEY" in os.environ: + del os.environ["TEST_KEY"] + except Exception: + pass + import madengine.core.constants as constants_module + with patch.object(constants_module, "CREDS", {"TEST_KEY": [{"from": "creds"}]}): + result = _get_env_or_creds_or_default( + "TEST_KEY", "TEST_KEY", _DEFAULT_NAS_NODES + ) + assert result == [{"from": "creds"}] + + def test_default_when_env_and_creds_unset(self): + """When env unset and creds missing key, default is returned.""" + import madengine.core.constants as constants_module + with patch.dict(os.environ, {}, clear=False): + if "TEST_KEY" in os.environ: + del os.environ["TEST_KEY"] + with patch.object(constants_module, "CREDS", {}): + result = _get_env_or_creds_or_default( + "TEST_KEY", "TEST_KEY", _DEFAULT_MAD_AWS_S3 + ) + assert result == _DEFAULT_MAD_AWS_S3 + + +class TestConstantsPublicAPI: + """Test that the four constants still expose correct shape (smoke).""" + + def test_nas_nodes_is_list_of_dicts(self): + """NAS_NODES is a list of node config dicts.""" + assert isinstance(NAS_NODES, list) + assert len(NAS_NODES) >= 1 + for node in NAS_NODES: + assert isinstance(node, dict) + assert "NAME" in node or "HOST" in node + + def test_mad_aws_s3_has_expected_keys(self): + """MAD_AWS_S3 has USERNAME and PASSWORD.""" + assert isinstance(MAD_AWS_S3, dict) + assert "USERNAME" in MAD_AWS_S3 + assert "PASSWORD" in MAD_AWS_S3 + + def test_mad_minio_has_expected_keys(self): + """MAD_MINIO has USERNAME, PASSWORD, MINIO_ENDPOINT, AWS_ENDPOINT_URL_S3.""" + assert 
isinstance(MAD_MINIO, dict) + assert "USERNAME" in MAD_MINIO + assert "PASSWORD" in MAD_MINIO + assert "MINIO_ENDPOINT" in MAD_MINIO + assert "AWS_ENDPOINT_URL_S3" in MAD_MINIO + + def test_public_github_rocm_key_has_expected_keys(self): + """PUBLIC_GITHUB_ROCM_KEY has username and token (no value assert to avoid leaking secrets).""" + assert isinstance(PUBLIC_GITHUB_ROCM_KEY, dict) + assert set(PUBLIC_GITHUB_ROCM_KEY.keys()) >= {"username", "token"}, ( + "PUBLIC_GITHUB_ROCM_KEY must have at least keys 'username' and 'token'" + ) diff --git a/tests/unit/test_deployment.py b/tests/unit/test_deployment.py new file mode 100644 index 00000000..1f031650 --- /dev/null +++ b/tests/unit/test_deployment.py @@ -0,0 +1,133 @@ +"""Unit tests for deployment: base (create_jinja_env) and common (launcher, rocprof, profiling).""" + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from madengine.deployment.base import create_jinja_env +from madengine.deployment.common import ( + VALID_LAUNCHERS, + configure_multi_node_profiling, + is_rocprofv3_available, + normalize_launcher, +) + + +# ---- deployment.base (create_jinja_env) ---- + +class TestCreateJinjaEnv: + """Test create_jinja_env helper.""" + + def test_returns_environment_with_dirname_basename_filters(self): + """create_jinja_env returns Environment with dirname and basename filters.""" + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) + (p / "test.j2").write_text("dir={{ path | dirname }} name={{ path | basename }}") + env = create_jinja_env(p) + template = env.get_template("test.j2") + out = template.render(path="/foo/bar/baz.txt") + assert "dir=/foo/bar" in out or "dir=foo/bar" in out + assert "name=baz.txt" in out + + def test_template_dir_must_exist(self): + """create_jinja_env works when template_dir exists.""" + with tempfile.TemporaryDirectory() as tmpdir: + env = create_jinja_env(Path(tmpdir)) + assert env is not None + assert 
env.filters.get("dirname") is not None + assert env.filters.get("basename") is not None + + +# ---- deployment.common ---- + +class TestValidLaunchers: + """VALID_LAUNCHERS constant.""" + + def test_contains_expected_launchers(self): + assert "torchrun" in VALID_LAUNCHERS + assert "vllm" in VALID_LAUNCHERS + assert "sglang-disagg" in VALID_LAUNCHERS + + +class TestNormalizeLauncher: + """normalize_launcher behavior.""" + + def test_valid_launcher_passthrough(self): + for lt in VALID_LAUNCHERS: + assert normalize_launcher(lt, "kubernetes") == lt + assert normalize_launcher(lt, "slurm") == lt + assert normalize_launcher(lt, "local") == lt + + @pytest.mark.parametrize("launcher", [None, "", "invalid"]) + def test_invalid_or_missing_launcher_kubernetes_returns_native(self, launcher): + assert normalize_launcher(launcher, "kubernetes") == "native" + + @pytest.mark.parametrize("deployment", ["slurm", "local", "unknown"]) + def test_invalid_or_missing_launcher_non_k8s_returns_docker(self, deployment): + assert normalize_launcher(None, deployment) == "docker" + + +class TestIsRocprofv3Available: + """is_rocprofv3_available (mocked subprocess).""" + + def test_returns_true_when_help_succeeds(self): + with patch("madengine.deployment.common.subprocess.run") as m: + m.return_value = MagicMock(returncode=0) + assert is_rocprofv3_available() is True + m.assert_called_once() + assert m.call_args[0][0] == ["rocprofv3", "--help"] + + def test_returns_false_when_not_found(self): + with patch("madengine.deployment.common.subprocess.run") as m: + m.side_effect = FileNotFoundError() + assert is_rocprofv3_available() is False + + def test_returns_false_on_timeout(self): + import subprocess + with patch("madengine.deployment.common.subprocess.run") as m: + m.side_effect = subprocess.TimeoutExpired("rocprofv3", 5) + assert is_rocprofv3_available() is False + + +class TestConfigureMultiNodeProfiling: + """configure_multi_node_profiling with mocked is_rocprofv3_available.""" + + def 
test_single_node_returns_tools_unchanged(self): + logger = MagicMock() + tools = [{"name": "rocprof"}] + out = configure_multi_node_profiling(1, tools, logger) + assert out["enabled"] is True + assert out["mode"] == "single_node" + assert out["tools"] is tools + assert out["per_node_collection"] is False + + @patch("madengine.deployment.common.is_rocprofv3_available", return_value=False) + def test_multi_node_no_rocprofv3_returns_disabled(self, _mock_avail): + logger = MagicMock() + out = configure_multi_node_profiling(2, [{"name": "rocprof"}], logger) + assert out["enabled"] is False + assert out["mode"] == "multi_node_unsupported" + assert out["tools"] == [] + logger.warning.assert_called_once() + + @patch("madengine.deployment.common.is_rocprofv3_available", return_value=True) + def test_multi_node_upgrades_rocprof_to_rocprofv3(self, _mock_avail): + logger = MagicMock() + tools = [{"name": "rocprof", "args": ["--trace"]}] + out = configure_multi_node_profiling(2, tools, logger) + assert out["enabled"] is True + assert out["mode"] == "multi_node" + assert out["per_node_collection"] is True + assert len(out["tools"]) == 1 + assert out["tools"][0]["name"] == "rocprofv3" + assert out["tools"][0]["args"] == ["--trace"] + + @patch("madengine.deployment.common.is_rocprofv3_available", return_value=True) + def test_multi_node_other_tools_unchanged(self, _mock_avail): + logger = MagicMock() + tools = [{"name": "rccl_trace"}, {"name": "rocprof"}] + out = configure_multi_node_profiling(2, tools, logger) + assert out["tools"][0]["name"] == "rccl_trace" + assert out["tools"][1]["name"] == "rocprofv3" diff --git a/tests/unit/test_execution.py b/tests/unit/test_execution.py new file mode 100644 index 00000000..a271d937 --- /dev/null +++ b/tests/unit/test_execution.py @@ -0,0 +1,154 @@ +"""Unit tests for execution: container_runner_helpers and dockerfile_utils.""" + +import pytest + +from madengine.execution.container_runner_helpers import ( + make_run_log_file_path, + 
resolve_run_timeout, +) +from madengine.execution.dockerfile_utils import ( + GPU_ARCH_VARIABLES, + is_compilation_arch_compatible, + is_target_arch_compatible_with_variable, + normalize_architecture_name, + parse_dockerfile_gpu_variables, +) + + +# ---- container_runner_helpers ---- + +class TestResolveRunTimeout: + """resolve_run_timeout behavior.""" + + def test_model_timeout_used_when_cli_default(self): + assert resolve_run_timeout({"timeout": 3600}, 7200) == 3600 + assert resolve_run_timeout({"timeout": 100}, 7200) == 100 + + def test_cli_timeout_used_when_explicit(self): + assert resolve_run_timeout({"timeout": 3600}, 6000) == 6000 + assert resolve_run_timeout({"timeout": 3600}, 100) == 100 + + def test_cli_default_returned_when_no_model_timeout(self): + assert resolve_run_timeout({}, 7200) == 7200 + assert resolve_run_timeout({"name": "x"}, 3600) == 3600 + + @pytest.mark.parametrize("model_timeout", [None, 0]) + def test_falsy_model_timeout_ignored_uses_cli(self, model_timeout): + assert resolve_run_timeout({"timeout": model_timeout}, 7200) == 7200 + + def test_custom_default_cli(self): + assert resolve_run_timeout({"timeout": 100}, 5000, default_cli_timeout=5000) == 100 + assert resolve_run_timeout({"timeout": 100}, 7200, default_cli_timeout=5000) == 7200 + + +class TestMakeRunLogFilePath: + """make_run_log_file_path behavior.""" + + def test_basic_format(self): + out = make_run_log_file_path( + {"name": "org/model"}, "ci-org_model_ubuntu.22.04", "", + ) + assert out == "org_model_ubuntu.22.04.live.log" + + def test_phase_suffix_appended(self): + out = make_run_log_file_path({"name": "a/b"}, "ci-a_b_cuda", ".run") + assert out == "a_b_cuda.run.live.log" + + def test_slashes_in_model_name_replaced(self): + out = make_run_log_file_path( + {"name": "foo/bar/baz"}, "ci-foo_bar_baz_ubuntu", "", + ) + assert "/" not in out + assert out.endswith(".live.log") + + def test_image_without_ci_prefix(self): + out = make_run_log_file_path({"name": "x/y"}, 
"registry/x_y_tag", "") + assert "registry_x_y_tag" in out or "x_y" in out + assert out.endswith(".live.log") + + def test_no_model_prefix_in_image(self): + out = make_run_log_file_path( + {"name": "other/model"}, "ci-some_ubuntu_22", "", + ) + assert out == "other_model_some_ubuntu_22.live.log" + + +# ---- dockerfile_utils ---- + +class TestGpuArchVariables: + def test_contains_expected_vars(self): + assert "MAD_SYSTEM_GPU_ARCHITECTURE" in GPU_ARCH_VARIABLES + assert "PYTORCH_ROCM_ARCH" in GPU_ARCH_VARIABLES + assert "GFX_COMPILATION_ARCH" in GPU_ARCH_VARIABLES + + +class TestParseDockerfileGpuVariables: + def test_empty_content(self): + assert parse_dockerfile_gpu_variables("") == {} + assert parse_dockerfile_gpu_variables("FROM ubuntu") == {} + + def test_arg_parsed(self): + content = "ARG PYTORCH_ROCM_ARCH=gfx90a" + out = parse_dockerfile_gpu_variables(content) + assert "PYTORCH_ROCM_ARCH" in out + assert out["PYTORCH_ROCM_ARCH"] == ["gfx90a"] + + def test_multi_arch_semicolon(self): + content = "ARG GPU_TARGETS=gfx90a;gfx908" + out = parse_dockerfile_gpu_variables(content) + assert "GPU_TARGETS" in out + assert set(out["GPU_TARGETS"]) == {"gfx90a", "gfx908"} + + def test_takes_last_definition(self): + content = "ARG PYTORCH_ROCM_ARCH=gfx908\nARG PYTORCH_ROCM_ARCH=gfx90a" + out = parse_dockerfile_gpu_variables(content) + assert out["PYTORCH_ROCM_ARCH"] == ["gfx90a"] + + +class TestNormalizeArchitectureName: + def test_gfx_passthrough(self): + assert normalize_architecture_name("gfx90a") == "gfx90a" + assert normalize_architecture_name("gfx942") == "gfx942" + + def test_mi_aliases(self): + assert normalize_architecture_name("mi100") == "gfx908" + assert normalize_architecture_name("mi-200") == "gfx90a" + assert normalize_architecture_name("mi300x") == "gfx942" + + def test_empty_returns_none(self): + assert normalize_architecture_name("") is None + assert normalize_architecture_name(" ") is None + + +class TestIsTargetArchCompatibleWithVariable: + def 
test_mad_system_always_compatible(self): + assert is_target_arch_compatible_with_variable( + "MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx90a"], "gfx908" + ) is True + + def test_multi_arch_target_in_list(self): + assert is_target_arch_compatible_with_variable( + "PYTORCH_ROCM_ARCH", ["gfx90a", "gfx908"], "gfx90a" + ) is True + assert is_target_arch_compatible_with_variable( + "GPU_TARGETS", ["gfx90a"], "gfx908" + ) is False + + def test_gfx_compilation_exact_match(self): + assert is_target_arch_compatible_with_variable( + "GFX_COMPILATION_ARCH", ["gfx90a"], "gfx90a" + ) is True + + +class TestIsCompilationArchCompatible: + def test_exact_match(self): + assert is_compilation_arch_compatible("gfx90a", "gfx90a") is True + assert is_compilation_arch_compatible("gfx942", "gfx942") is True + + def test_mismatch(self): + assert is_compilation_arch_compatible("gfx90a", "gfx908") is False + assert is_compilation_arch_compatible("gfx908", "gfx90a") is False + + def test_unknown_arch_treated_as_exact(self): + assert is_compilation_arch_compatible("gfx999", "gfx999") is True + assert is_compilation_arch_compatible("gfx999", "gfx90a") is False diff --git a/tests/unit/test_orchestration.py b/tests/unit/test_orchestration.py new file mode 100644 index 00000000..6c82ff9b --- /dev/null +++ b/tests/unit/test_orchestration.py @@ -0,0 +1,179 @@ +"""Unit tests for orchestration: image_filtering and orchestrator init/validation.""" + +import pytest +from unittest.mock import MagicMock, patch + +from madengine.orchestration.image_filtering import ( + filter_images_by_gpu_compatibility, + filter_images_by_skip_gpu_arch, +) +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.run_orchestrator import RunOrchestrator +from madengine.core.errors import ConfigurationError + + +# ---- image_filtering ---- + +class TestFilterImagesByGpuCompatibility: + """filter_images_by_gpu_compatibility behavior.""" + + def test_empty_input(self): + compat, skipped = 
filter_images_by_gpu_compatibility({}, "AMD", "gfx90a") + assert compat == {} + assert skipped == [] + + def test_no_vendor_treated_as_compatible(self): + built = {"m1": {"gpu_vendor": "", "gpu_architecture": ""}} + compat, skipped = filter_images_by_gpu_compatibility(built, "AMD", "gfx90a") + assert compat == {"m1": built["m1"]} + assert skipped == [] + + def test_vendor_match_included_with_or_without_arch(self): + """Vendor match with empty arch or matching arch both include the image.""" + for gpu_arch in ["", "gfx90a"]: + built = {"m1": {"gpu_vendor": "AMD", "gpu_architecture": gpu_arch}} + compat, skipped = filter_images_by_gpu_compatibility(built, "AMD", "gfx90a") + assert compat == {"m1": built["m1"]} + assert skipped == [] + + def test_vendor_match_arch_mismatch_skipped(self): + built = {"m1": {"gpu_vendor": "AMD", "gpu_architecture": "gfx90a"}} + compat, skipped = filter_images_by_gpu_compatibility(built, "AMD", "sm_90") + assert compat == {} + assert len(skipped) == 1 + assert skipped[0][0] == "m1" + assert "architecture mismatch" in skipped[0][1] + + def test_vendor_mismatch_skipped(self): + built = {"m1": {"gpu_vendor": "NVIDIA", "gpu_architecture": "sm_90"}} + compat, skipped = filter_images_by_gpu_compatibility(built, "AMD", "gfx90a") + assert compat == {} + assert len(skipped) == 1 + assert "vendor mismatch" in skipped[0][1] + + def test_none_runtime_vendor_accepts_any_vendor(self): + built = {"m1": {"gpu_vendor": "AMD", "gpu_architecture": "gfx90a"}} + compat, skipped = filter_images_by_gpu_compatibility(built, "NONE", "gfx90a") + assert compat == {"m1": built["m1"]} + assert skipped == [] + + +class TestFilterImagesBySkipGpuArch: + """filter_images_by_skip_gpu_arch behavior.""" + + def test_disable_skip_returns_all(self): + built = {"m1": {}} + models = {"m1": {"skip_gpu_arch": "A100"}} + compat, skipped = filter_images_by_skip_gpu_arch( + built, models, "A100", disable_skip=True + ) + assert compat == built + assert skipped == [] + + def 
test_no_skip_gpu_arch_included(self): + built = {"m1": {"img": "x"}} + models = {"m1": {}} + compat, skipped = filter_images_by_skip_gpu_arch(built, models, "A100") + assert compat == {"m1": built["m1"]} + assert skipped == [] + + def test_skip_gpu_arch_match_skipped(self): + built = {"m1": {"img": "x"}} + models = {"m1": {"skip_gpu_arch": "A100"}} + compat, skipped = filter_images_by_skip_gpu_arch(built, models, "A100") + assert compat == {} + assert len(skipped) == 1 + assert skipped[0] == ("m1", built["m1"], "A100") + + def test_skip_gpu_arch_nvidia_prefix_normalized(self): + built = {"m1": {}} + models = {"m1": {"skip_gpu_arch": "A100"}} + compat, skipped = filter_images_by_skip_gpu_arch( + built, models, "NVIDIA A100" + ) + assert compat == {} + assert skipped[0][2] == "NVIDIA A100" + + def test_skip_gpu_arch_no_match_included(self): + built = {"m1": {}} + models = {"m1": {"skip_gpu_arch": "A100"}} + compat, skipped = filter_images_by_skip_gpu_arch(built, models, "gfx90a") + assert compat == {"m1": built["m1"]} + assert skipped == [] + + +# ---- orchestrator init and validation ---- + +@pytest.mark.unit +class TestBuildOrchestratorInit: + """Test Build Orchestrator initialization.""" + + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + def test_initializes_with_minimal_args(self, mock_exists, mock_context): + """Should initialize with minimal arguments.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.additional_context == {} + assert orchestrator.credentials is None + + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + def test_parses_additional_context_json(self, mock_exists, mock_context): + """Should parse JSON additional context.""" + mock_args = MagicMock() + 
mock_args.additional_context = '{"key": "value"}' + mock_args.live_output = True + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.additional_context == {"key": "value"} + + +@pytest.mark.unit +class TestRunOrchestratorInit: + """Test Run Orchestrator initialization.""" + + @patch("madengine.orchestration.run_orchestrator.Context") + def test_initializes_with_args(self, mock_context): + """Should initialize with provided arguments.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.additional_context == {} + + def test_parses_deploy_type_from_context(self): + """Should extract deploy type from additional context.""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "slurm"}' + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.additional_context["deploy"] == "slurm" + + +@pytest.mark.unit +class TestManifestValidation: + """Test manifest validation logic.""" + + @patch("os.path.exists", return_value=False) + def test_run_without_manifest_or_tags_raises_error(self, mock_exists): + """Should raise ConfigurationError without manifest or tags.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + with pytest.raises(ConfigurationError): + orchestrator.execute(manifest_file=None, tags=None) diff --git a/tests/unit/test_orchestrator_logic.py b/tests/unit/test_orchestrator_logic.py deleted file mode 100644 index 4f0aaa6d..00000000 --- a/tests/unit/test_orchestrator_logic.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Orchestrator logic unit tests. - -Pure unit tests for orchestrator initialization and logic without external dependencies. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
-""" - -import pytest -from unittest.mock import MagicMock, mock_open, patch - -from madengine.orchestration.build_orchestrator import BuildOrchestrator -from madengine.orchestration.run_orchestrator import RunOrchestrator -from madengine.core.errors import ConfigurationError - - -@pytest.mark.unit -class TestBuildOrchestratorInit: - """Test Build Orchestrator initialization.""" - - @patch("madengine.orchestration.build_orchestrator.Context") - @patch("os.path.exists", return_value=False) - def test_initializes_with_minimal_args(self, mock_exists, mock_context): - """Should initialize with minimal arguments.""" - mock_args = MagicMock() - mock_args.additional_context = None - mock_args.live_output = True - - orchestrator = BuildOrchestrator(mock_args) - - assert orchestrator.args == mock_args - assert orchestrator.additional_context == {} - assert orchestrator.credentials is None - - @patch("madengine.orchestration.build_orchestrator.Context") - @patch("os.path.exists", return_value=False) - def test_parses_additional_context_json(self, mock_exists, mock_context): - """Should parse JSON additional context.""" - mock_args = MagicMock() - mock_args.additional_context = '{"key": "value"}' - mock_args.live_output = True - - orchestrator = BuildOrchestrator(mock_args) - - assert orchestrator.additional_context == {"key": "value"} - - -@pytest.mark.unit -class TestRunOrchestratorInit: - """Test Run Orchestrator initialization.""" - - @patch("madengine.orchestration.run_orchestrator.Context") - def test_initializes_with_args(self, mock_context): - """Should initialize with provided arguments.""" - mock_args = MagicMock() - mock_args.additional_context = None - mock_args.live_output = True - - orchestrator = RunOrchestrator(mock_args) - - assert orchestrator.args == mock_args - assert orchestrator.additional_context == {} - - def test_parses_deploy_type_from_context(self): - """Should extract deploy type from additional context.""" - mock_args = MagicMock() - 
mock_args.additional_context = '{"deploy": "slurm"}' - mock_args.live_output = True - - orchestrator = RunOrchestrator(mock_args) - - assert orchestrator.additional_context["deploy"] == "slurm" - - -@pytest.mark.unit -class TestManifestValidation: - """Test manifest validation logic.""" - - @patch("os.path.exists", return_value=False) - def test_run_without_manifest_or_tags_raises_error(self, mock_exists): - """Should raise ConfigurationError without manifest or tags.""" - mock_args = MagicMock() - mock_args.additional_context = None - mock_args.live_output = True - - orchestrator = RunOrchestrator(mock_args) - - with pytest.raises(ConfigurationError): - orchestrator.execute(manifest_file=None, tags=None) - - -# Total: 5 unit tests diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index f33916eb..c0e3dbd9 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -20,10 +20,14 @@ def test_get_rocm_path_default(self, monkeypatch): path = get_rocm_path(None) assert path == "/opt/rocm" - def test_get_rocm_path_override(self): - """Override argument takes precedence.""" + def test_get_rocm_path_override(self, monkeypatch): + """Override argument takes precedence over env.""" path = get_rocm_path("/custom/rocm") assert path == os.path.abspath("/custom/rocm").rstrip(os.sep) + monkeypatch.setenv("ROCM_PATH", "/env/rocm") + path_with_env = get_rocm_path("/cli/rocm") + assert path_with_env == os.path.abspath("/cli/rocm").rstrip(os.sep) + monkeypatch.delenv("ROCM_PATH", raising=False) def test_get_rocm_path_env(self, monkeypatch): """ROCM_PATH env is used when override is None.""" @@ -34,15 +38,6 @@ def test_get_rocm_path_env(self, monkeypatch): finally: monkeypatch.delenv("ROCM_PATH", raising=False) - def test_get_rocm_path_override_overrides_env(self, monkeypatch): - """Override takes precedence over ROCM_PATH env.""" - monkeypatch.setenv("ROCM_PATH", "/env/rocm") - try: - path = get_rocm_path("/cli/rocm") - assert path == 
os.path.abspath("/cli/rocm").rstrip(os.sep) - finally: - monkeypatch.delenv("ROCM_PATH", raising=False) - @pytest.mark.unit class TestContextRocmPath: diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 00000000..2e09b8c3 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,93 @@ +"""Unit tests for utils: path_utils and run_details.""" + +import os +from pathlib import Path +from unittest.mock import patch + +import pytest + +from madengine.utils.path_utils import scripts_base_dir_from, get_madengine_root +from madengine.utils.run_details import ( + get_pipeline, + get_build_number, + flatten_tags_in_place, +) + + +# ---- path_utils ---- + +class TestScriptsBaseDirFrom: + """Test scripts_base_dir_from helper.""" + + @pytest.mark.parametrize("value", [None, "", " "]) + def test_falsy_input_returns_none(self, value): + """None, empty string, or whitespace return None.""" + assert scripts_base_dir_from(value) is None + + def test_file_path_returns_parent_dir(self): + """Path to a file returns its parent directory.""" + assert scripts_base_dir_from("/foo/bar/script.sh") == "/foo/bar" + assert scripts_base_dir_from("scripts/run.sh") == "scripts" + + def test_dir_path_returns_parent_of_dir(self): + """Path to a directory returns the parent of that directory.""" + assert scripts_base_dir_from("/foo/bar/baz") == "/foo/bar" + + def test_single_segment_returns_empty_string(self): + """Single segment (no slash) returns empty string from dirname.""" + assert scripts_base_dir_from("script.sh") == "" + + +class TestGetMadengineRoot: + """Test get_madengine_root helper.""" + + def test_returns_madengine_package_path(self): + """Returns a Path that exists, is a directory, and ends with 'madengine'.""" + root = get_madengine_root() + assert isinstance(root, Path) + assert root.name == "madengine" + assert root.exists() and root.is_dir() + + +# ---- run_details ---- + +class TestGetPipeline: + @pytest.mark.parametrize("env_val,expected", 
[({}, ""), ({"pipeline": "ci-mad"}, "ci-mad")]) + def test_pipeline_from_env_or_default(self, env_val, expected): + with patch.dict(os.environ, env_val, clear=False): + if not env_val and "pipeline" in os.environ: + del os.environ["pipeline"] + assert get_pipeline() == expected + + +class TestGetBuildNumber: + @pytest.mark.parametrize("env_val,expected", [({}, "0"), ({"BUILD_NUMBER": "42"}, "42")]) + def test_build_number_from_env_or_default(self, env_val, expected): + with patch.dict(os.environ, env_val, clear=False): + if not env_val and "BUILD_NUMBER" in os.environ: + del os.environ["BUILD_NUMBER"] + assert get_build_number() == expected + + +class TestFlattenTagsInPlace: + def test_list_converted_to_comma_string(self): + record = {"tags": ["a", "b", "c"], "other": 1} + flatten_tags_in_place(record) + assert record["tags"] == "a,b,c" + assert record["other"] == 1 + + def test_non_list_unchanged(self): + record = {"tags": "already,a,string"} + flatten_tags_in_place(record) + assert record["tags"] == "already,a,string" + + def test_missing_tags_unchanged(self): + record = {"model": "m1"} + flatten_tags_in_place(record) + assert "tags" not in record + assert record["model"] == "m1" + + def test_empty_list_converted_to_empty_string(self): + record = {"tags": []} + flatten_tags_in_place(record) + assert record["tags"] == ""