Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/profiling-configs/rocprofv3_multi_gpu.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 1,
"gpus_per_node": 4,
"time": "02:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": false
},

Expand Down
2 changes: 1 addition & 1 deletion examples/profiling-configs/rocprofv3_multi_node.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 2,
"gpus_per_node": 4,
"time": "04:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
14 changes: 7 additions & 7 deletions examples/slurm-configs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ madengine uses intelligent multi-layer configuration merging:
"nodelist": "node01,node02", // Optional: restrict job to these nodes (skips node health preflight)
"gpus_per_node": 8, // GPUs per node
"time": "24:00:00", // Wall time (HH:MM:SS)
"output_dir": "./slurm_output", // Local output directory
"output_dir": "./slurm_results", // Local output directory
"results_dir": "/shared/results", // Shared results collection
"shared_workspace": "/shared/workspace", // Shared workspace (NFS/Lustre)
"exclusive": true, // Exclusive node access
Expand Down Expand Up @@ -584,10 +584,10 @@ squeue -u $USER
scontrol show job <job_id>

# View output logs (real-time)
tail -f slurm_output/madengine-*_<job_id>_*.out
tail -f slurm_results/madengine-*_<job_id>_*.out

# View error logs
tail -f slurm_output/madengine-*_<job_id>_*.err
tail -f slurm_results/madengine-*_<job_id>_*.err

# Cancel job if needed
scancel <job_id>
Expand All @@ -600,7 +600,7 @@ scancel <job_id>
- Check SLURM partition exists: `sinfo`
- Verify GPU resources available: `sinfo -o "%P %.5a %.10l %.6D %.6t %N %G"`
- Check SLURM account/QoS settings
- Review job script: `slurm_output/madengine_*.sh`
- Review job script: `slurm_results/madengine_*.sh`

### Out of Memory Errors

Expand Down Expand Up @@ -715,8 +715,8 @@ MODEL_DIR=models/my-model madengine run \
watch squeue -u $USER

# 6. Check logs when complete
ls -lh slurm_output/
tail -f slurm_output/madengine-*_<job_id>_*.out
ls -lh slurm_results/
tail -f slurm_results/madengine-*_<job_id>_*.out
```

### vLLM Inference Workflow
Expand All @@ -739,7 +739,7 @@ MODEL_DIR=models/llama2-70b madengine run \
--manifest-file build_manifest.json

# 5. Monitor for OOM errors
tail -f slurm_output/madengine-*_<job_id>_*.err | grep -i "memory"
tail -f slurm_results/madengine-*_<job_id>_*.err | grep -i "memory"

# 6. If OOM occurs, adjust config and rebuild
# Edit your config file to set VLLM_KV_CACHE_SIZE to 0.6 or 0.7
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 1,
"gpus_per_node": 1,
"time": "01:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": false
},

Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/02-single-node-multi-gpu.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 1,
"gpus_per_node": 8,
"time": "12:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"nodelist": "node01,node02",
"gpus_per_node": 8,
"time": "24:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true,
"network_interface": "eth0"
},
Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/03-multi-node-basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 2,
"gpus_per_node": 8,
"time": "24:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true,
"network_interface": "eth0"
},
Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/04-multi-node-advanced.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 4,
"gpus_per_node": 8,
"time": "48:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"results_dir": "/shared/results",
"shared_workspace": "/shared/workspace",
"exclusive": true,
Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/05-vllm-single-node.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 1,
"gpus_per_node": 4,
"time": "02:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/06-vllm-multi-node.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"nodes": 2,
"gpus_per_node": 4,
"time": "00:45:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true,

"_comment_node_check": "Preflight GPU health check (helps avoid OOM from stale processes)",
Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/07-sglang-single-node.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 1,
"gpus_per_node": 4,
"time": "02:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/08-sglang-multi-node.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"nodes": 2,
"gpus_per_node": 4,
"time": "04:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/cluster-amd-rccl.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"nodes": 1,
"gpus_per_node": 8,
"time": "12:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"nodes": 7,
"gpus_per_node": 8,
"time": "04:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
2 changes: 1 addition & 1 deletion examples/slurm-configs/basic/sglang-disagg-multi-node.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"nodes": 5,
"gpus_per_node": 8,
"time": "04:00:00",
"output_dir": "./slurm_output",
"output_dir": "./slurm_results",
"exclusive": true
},

Expand Down
14 changes: 8 additions & 6 deletions src/madengine/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,6 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row
console.print("[yellow]⚠️ Performance CSV is empty[/yellow]")
return

# Get session_start_row to mark current runs (don't filter, just mark)
total_rows = len(df)

# Try parameter first, then fall back to marker file
Expand All @@ -378,7 +377,7 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row
perf_table.add_column("Index", justify="right", style="dim")
perf_table.add_column("Model", style="cyan")
perf_table.add_column("Topology", justify="center", style="blue")
perf_table.add_column("Launcher", justify="center", style="magenta") # Distributed launcher
perf_table.add_column("Launcher", justify="center", style="magenta")
perf_table.add_column("Deployment", justify="center", style="cyan")
perf_table.add_column("GPU Arch", style="yellow")
perf_table.add_column("Performance", justify="right", style="green")
Expand All @@ -388,20 +387,23 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row
perf_table.add_column("Data Name", style="magenta")
perf_table.add_column("Data Provider", style="magenta")

# Helper function to format duration
# Helper function to format duration (accepts float seconds or "Xs" string)
def format_duration(duration):
if pd.isna(duration) or duration == "":
if pd.isna(duration) or duration == "" or duration is None:
return "N/A"
try:
dur = float(duration)
if isinstance(duration, str) and duration.strip().endswith("s"):
dur = float(duration.strip()[:-1])
else:
dur = float(duration)
if dur < 1:
return f"{dur*1000:.0f}ms"
elif dur < 60:
return f"{dur:.2f}s"
else:
return f"{dur/60:.1f}m"
except (ValueError, TypeError):
return "N/A"
return str(duration) if duration else "N/A"

# Helper function to format performance
def format_performance(perf):
Expand Down
Loading