diff --git a/MLExamples/TinyTransformer/README.md b/MLExamples/TinyTransformer/README.md index bb1bd657..e63aa322 100644 --- a/MLExamples/TinyTransformer/README.md +++ b/MLExamples/TinyTransformer/README.md @@ -42,7 +42,7 @@ This workshop follows a progressive optimization methodology with four implement | **V3 Triton** | 156,652 | 52.3 | 51.3 | 0.6 | 0.4 | 916.2 | **3.13x** | | **V4 Ultra** | 157,169 | 52.1 | 51.1 | 0.6 | 0.4 | 916.5 | **3.14x** | -**See [PERFORMANCE_RESULTS.md](PERFORMANCE_RESULTS.md) for complete analysis** +Performance figures for the small and medium configurations are summarized in the tables above and in [Key Performance Insights](#key-performance-insights). ### Profiling Tools Progression @@ -70,15 +70,15 @@ Each version introduces additional profiling capabilities: ## Quick Start -### 0. Set up environment -On the training cluster's compute node, the required environment may be set up using the following -commands: +### 0. Set up and verify environment +On the training cluster's compute node, load the modules (adjust names/versions for your site): ```bash -module load rocm pytorch openmpi rocprofiler-compute rocprofiler-systems/develop +module load rocm pytorch ``` -### 1. Verify Environment +Then confirm ROCm, PyTorch, and the GPU(s) are set up correctly: + ```bash # Check ROCm installation rocminfo @@ -90,7 +90,7 @@ rocm-smi python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')" ``` -### 2. Run Version 1 (Baseline) - 5 minutes +### 1. 
Run Version 1 (Baseline) - 5 minutes ```bash cd version1_pytorch_baseline/ python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20 @@ -105,7 +105,7 @@ For a deeper analysis with the PyTorch profiler, and visualizing the output in T please follow the workshop exercises in [version1_pytorch_baseline/README.md](https://github.com/amd/HPCTrainingExamples/tree/main/MLExamples/TinyTransformer/version1_pytorch_baseline#workshop-exercises). -### 3. Run Version 2 (Fused) - 5 minutes +### 2. Run Version 2 (Fused) - 5 minutes ```bash cd version2_pytorch_fused python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 @@ -148,7 +148,7 @@ with `rocprof-sys` using the command below: ```bash rocprof-sys-run --profile --trace -- python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 30 ``` -View the trace at [https://ui.perfetto.dev](https://ui.perfetto.dev). +View the trace with [https://ui.perfetto.dev](https://ui.perfetto.dev). ### 4. Run Version 4 (Ultra optimized) - 5 minutes ```bash @@ -174,75 +174,33 @@ cd version3_triton/exercises/performance_debugging/ ## Directory Structure +Layout under `MLExamples/TinyTransformer/` in this repository: + ``` -ai-workshop-training/ - README.md # This overview - setup/ # Environment and prerequisites - environment_setup.md # Detailed setup instructions - environment_setup.sh # Automated setup script - requirements.txt # Python dependencies - validation_scripts/ # Environment validation - test_environment.py # Comprehensive environment test - test_rocm_installation.py # ROCm stack validation - test_profiling_tools.py # Profiling tools validation - version1_pytorch_baseline/ # Standard PyTorch implementation - README.md # Detailed guided instructions - tiny_llama_v1.py # Enhanced baseline implementation - run_pytorch_profiler.py # PyTorch profiler integration - run_deepspeed_flops.py # DeepSpeed FLOPS profiler - run_all_profilers.sh # Orchestrated profiling script - exercises/ # Hands-on exercises and analysis 
- exercise_1_baseline_analysis.md - exercise_2_memory_analysis.md - exercise_3_bottleneck_identification.md - version2_pytorch_fused/ # Fused operations optimization - README.md # Fusion optimization guide - tiny_llama_v2.py # Fused implementation - run_pytorch_profiler.py # Enhanced PyTorch profiling - run_deepspeed_flops.py # FLOPS analysis - run_rocprofv3.sh # rocprofv3 integration - run_rocprof_sys.sh # System profiling - run_rocprof_compute.sh # Kernel-level profiling - run_all_profilers.sh # Complete profiling suite - exercises/ # Advanced profiling exercises - exercise_1_fusion_analysis.md - exercise_2_flash_attention.md - exercise_3_rocm_tools_intro.md - version3_triton/ # Triton kernel integration - README.md # Triton optimization guide - tiny_llama_v3.py # Triton-enhanced implementation - triton_kernels.py # Custom Triton kernels - run_pytorch_profiler.py # Framework profiling - run_deepspeed_flops.py # Computational analysis - run_rocprofv3.sh # Legacy profiling - run_rocprof_sys.sh # System monitoring - run_rocprof_compute.sh # Advanced kernel analysis - run_all_profilers.sh # Complete profiling - exercises/ # Triton development exercises - exercise_1_triton_basics.md - exercise_2_custom_kernels.md - exercise_3_performance_tuning.md - version4_pytorch_sdpa/ # Ultra-fused implementation - README.md # Ultra-optimization guide - tiny_llama_v4.py # Ultra-fused implementation - triton_ultra_kernels.py # Ultra-fused kernels - [profiling scripts] # Complete profiling suite - exercises/ # Advanced optimization - exercise_1_ultra_fusion.md - exercise_2_register_optimization.md - exercise_3_production_deployment.md - analysis_tools/ # Performance analysis utilities - compare_versions.py # Cross-version performance comparison - roofline_analysis.py # Roofline model implementation - performance_dashboard.py # Interactive performance dashboard - regression_tester.py # Automated regression testing - report_generator.py # Comprehensive report generation - slides/ # 
Presentation materials - luka_presentation_materials/ # AI workshop slides - workshop_overview.pptx - profiling_methodology.pptx - optimization_techniques.pptx - results_analysis.pptx +TinyTransformer/ +├── README.md +├── TINY_LLAMA_ARCHITECTURE.md +├── TECHNICAL_APPENDICES.md +├── version1_pytorch_baseline/ +│ ├── tiny_llama_v1.py +│ ├── run_pytorch_profiler.py, run_deepspeed_flops.py, run_all_profilers.sh +│ ├── run_*.sh, launch_performance_study.sh +│ └── exercises/ +│ ├── exercise_1_baseline_analysis.md +│ ├── exercise_2_memory_analysis.md +│ └── exercise_3_bottleneck_identification.md +├── version2_pytorch_fused/ +│ ├── tiny_llama_v2.py +│ ├── run_*.py, run_*.sh, launch_performance_study.sh +│ └── exercises/ +├── version3_triton/ +│ ├── tiny_llama_v3.py, run_triton_profiling.py, run_rocprof_triton.sh +│ ├── launch_performance_study.sh +│ └── exercises/ (including performance_debugging/) +└── version4_pytorch_sdpa/ + ├── tiny_llama_v4.py, run_ultra_profiling.py, launch_performance_study.sh + └── exercises/ + └── exercise1_ultra_fusion.md ``` ## Workshop Execution Timeline @@ -345,10 +303,10 @@ Developed for the CASTIEL AI Workshop (October 16, 2024) by HPC/AI performance e ## License -MIT License - See LICENSE file for details +MIT License — see the repository [`LICENSE.md`](../../LICENSE.md) at the git root of **HPCTrainingExamples**. --- -**Ready to start profiling? Begin with the [Environment Setup Guide](setup/environment_setup.md)** +**Ready to start profiling?** Begin with [Quick Start](#quick-start) (environment modules and first runs) above. 
diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md index 59d84818..ef30687b 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md @@ -807,7 +807,7 @@ Understanding the available options: ```bash --enable-pytorch-profiler # Enable PyTorch profiler --profile-dir ./profiles # Directory for profile output ---profile-memory # Include memory profiling +--enable-memory-profiling # CUDA alloc records in profiler trace (TensorBoard memory views) --profile-operators # Detailed operator profiling --profile-steps 5 # Number of steps to profile ``` @@ -934,7 +934,7 @@ python3 tiny_llama_v1.py \ --num-steps 20 \ --enable-pytorch-profiler \ --profile-dir ./pytorch_profiles \ - --profile-memory + --enable-memory-profiling ``` **With DeepSpeed FLOPS Profiler:** @@ -1472,7 +1472,7 @@ python3 tiny_llama_v1.py \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_analysis_bs4 # Batch size 8 @@ -1481,7 +1481,7 @@ python3 tiny_llama_v1.py \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_analysis_bs8 # Batch size 16 @@ -1490,7 +1490,7 @@ python3 tiny_llama_v1.py \ --seq-len 128 \ --num-steps 15 \ --enable-pytorch-profiler \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_analysis_bs16 ``` @@ -1618,7 +1618,7 @@ python3 tiny_llama_v1.py \ --batch-size 8 \ --seq-len 64 \ --num-steps 10 \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_seq64 # Sequence length 128 (baseline) @@ -1626,7 +1626,7 @@ python3 tiny_llama_v1.py \ --batch-size 8 \ --seq-len 128 \ --num-steps 10 \ - 
--profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_seq128 # Sequence length 256 @@ -1634,7 +1634,7 @@ python3 tiny_llama_v1.py \ --batch-size 8 \ --seq-len 256 \ --num-steps 10 \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_seq256 # Sequence length 512 (might OOM - use smaller batch if needed) @@ -1642,7 +1642,7 @@ python3 tiny_llama_v1.py \ --batch-size 4 \ --seq-len 512 \ --num-steps 5 \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_seq512 ``` @@ -1694,7 +1694,7 @@ python3 tiny_llama_v1.py \ --seq-len 128 \ --num-steps 10 \ --enable-pytorch-profiler \ - --profile-memory \ + --enable-memory-profiling \ --profile-operators \ --profile-dir ./memory_hotspots ``` diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md index 7c1f20d3..8bd03d43 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md @@ -330,34 +330,39 @@ def forward(self, hidden_states, attention_mask=None, position_ids=None): **Objective**: Establish baseline performance metrics and identify computational bottlenecks. #### Step 1: Run Basic Training + +First, run the basic training without any profiling: + ```bash -# Basic training without profiling python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 10 - -# Expected output: Training loss progression and timing info ``` +The script should provide you with the key model architecture, the training progress and key performance metrics. 
+ #### Step 2: Enable PyTorch Profiler + +Next, run the same setup with the PyTorch profiler enabled via the `--enable-pytorch-profiler` flag: ```bash -# Make directory for the profiles -mkdir pytorch_profiles -# Run with PyTorch profiler enabled +mkdir -p pytorch_profiles python tiny_llama_v1.py \ --batch-size 8 \ --seq-len 128 \ --num-steps 10 \ --enable-pytorch-profiler \ --profile-dir ./pytorch_profiles - -# This generates detailed profiling traces in pytorch_profiles/ ``` +This generates detailed profiling traces in `pytorch_profiles/`. #### Step 3: Analyze Profiling Results + +You can visualize the results either via TensorBoard by running ```bash -# Launch TensorBoard to visualize profiles tensorboard --logdir pytorch_profiles --port 6006 +``` +and opening the printed link in your browser. -# Or generate text report +Alternatively, we provide a script that generates a text report that you can inspect in the terminal: +```bash python run_pytorch_profiler.py --analyze-existing pytorch_profiles/profile_*.json ``` @@ -398,7 +403,7 @@ python tiny_llama_v1.py \ --batch-size 8 \ --seq-len 128 \ --enable-pytorch-profiler \ - --profile-memory \ + --enable-memory-profiling \ --profile-dir ./memory_analysis # Generate memory timeline visualization diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md index 1cb9b199..9e934a16 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md @@ -8,7 +8,7 @@ Establish baseline performance metrics for Tiny LLaMA V1 and understand the prof ### Prerequisites -- Completed environment setup from `../setup/` +- Environment ready for PyTorch + ROCm (see workshop `README.md` [Quick Start](../../README.md#quick-start)) - Verified environment 
with validation scripts ### Duration diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/launch_performance_study.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/launch_performance_study.sh index 914eb258..66c08fb1 100755 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/launch_performance_study.sh +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/launch_performance_study.sh @@ -98,7 +98,7 @@ mkdir -p "$OUTPUT_DIR" # Display configuration echo "================================================================================" -echo "CASTILLE AI WORKSHOP - VERSION 1 BASELINE PERFORMANCE STUDY" +echo "CASTIEL AI WORKSHOP - VERSION 1 BASELINE PERFORMANCE STUDY" echo "================================================================================" echo "" echo "Problem Size: ${SIZE^^}" diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/run_all_profilers.sh b/MLExamples/TinyTransformer/version1_pytorch_baseline/run_all_profilers.sh index 591dd774..c8fef5cd 100755 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/run_all_profilers.sh +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/run_all_profilers.sh @@ -110,7 +110,7 @@ done # Print banner echo "=" * 80 -echo "CASTILLE AI WORKSHOP - COMPREHENSIVE PROFILING SUITE" +echo "CASTIEL AI WORKSHOP - COMPREHENSIVE PROFILING SUITE" echo " Version 1: PyTorch Baseline Performance Analysis" echo "=" * 80 echo "" diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py b/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py index defb8dca..fdc20cc4 100644 --- a/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py +++ b/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py @@ -22,7 +22,7 @@ python tiny_llama_v1.py --enable-pytorch-profiler --profile-dir ./profiles # With memory profiling - python tiny_llama_v1.py --enable-pytorch-profiler --profile-memory + python tiny_llama_v1.py 
--enable-pytorch-profiler --enable-memory-profiling # Complete profiling suite python tiny_llama_v1.py --enable-all-profiling --profile-dir ./complete_analysis @@ -117,6 +117,7 @@ def reset(self): self.metrics = { 'training_speed': [], 'memory_usage': [], + 'gpu_peak_memory_mb': [], 'gpu_utilization': [], 'loss_values': [], 'batch_times': [], @@ -141,8 +142,14 @@ def end_timing(self) -> float: self.start_time = None return elapsed - def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float]): - """Record metrics for a training batch.""" + def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float], + gpu_peak_memory_mb: Optional[float] = None): + """Record metrics for a training batch. + + gpu_peak_memory_mb: per-step peak device memory (bytes->MB) from + torch.cuda.max_memory_allocated() after reset_peak_memory_stats() at + step start; captures transient activations during backward. + """ self.total_samples += batch_size self.metrics['loss_values'].append(loss) self.metrics['batch_times'].append(timings.get('total', 0)) @@ -154,6 +161,8 @@ def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, if torch.cuda.is_available(): memory_mb = torch.cuda.memory_allocated() / (1024**2) self.metrics['memory_usage'].append(memory_mb) + if gpu_peak_memory_mb is not None: + self.metrics['gpu_peak_memory_mb'].append(gpu_peak_memory_mb) # Training speed (samples per second) if timings.get('total', 0) > 0: @@ -175,7 +184,13 @@ def get_summary(self) -> Dict[str, Any]: 'avg_optimizer_time': np.mean(self.metrics['optimizer_times']), } - if self.metrics['memory_usage']: + if self.metrics['gpu_peak_memory_mb']: + summary.update({ + 'peak_memory_mb': max(self.metrics['gpu_peak_memory_mb']), + 'avg_peak_memory_mb': np.mean(self.metrics['gpu_peak_memory_mb']), + 'avg_memory_mb': np.mean(self.metrics['memory_usage']) if self.metrics['memory_usage'] else 0.0, + }) + elif self.metrics['memory_usage']: 
summary.update({ 'peak_memory_mb': max(self.metrics['memory_usage']), 'avg_memory_mb': np.mean(self.metrics['memory_usage']) @@ -647,6 +662,9 @@ def train_tiny_llama( print("=" * 70) for step in range(num_steps): + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + # Start batch timing batch_timings = {} monitor.start_timing() @@ -692,8 +710,14 @@ def train_tiny_llama( # Total batch time batch_timings['total'] = sum(batch_timings.values()) + peak_mb: Optional[float] = None + if torch.cuda.is_available(): + torch.cuda.synchronize() + peak_mb = torch.cuda.max_memory_allocated() / (1024**2) + # Record metrics - monitor.record_batch_metrics(batch_size, loss.item(), batch_timings) + monitor.record_batch_metrics(batch_size, loss.item(), batch_timings, + gpu_peak_memory_mb=peak_mb) # PyTorch profiler step if pytorch_profiler: @@ -702,12 +726,13 @@ def train_tiny_llama( # Progress logging if step % 10 == 0: speed = batch_size / batch_timings['total'] if batch_timings['total'] > 0 else 0 - memory_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0 + live_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0 + peak_log = f"{peak_mb:6.1f}" if peak_mb is not None else " n/a" print(f"Step {step:3d}/{num_steps} | " f"Loss: {loss.item():.4f} | " f"Speed: {speed:5.1f} samples/sec | " - f"Memory: {memory_mb:6.1f} MB | " + f"Peak: {peak_log} MB | Live: {live_mb:6.1f} MB | " f"Time: {batch_timings['total']*1000:5.1f}ms") print("=" * 70) @@ -743,7 +768,9 @@ def train_tiny_llama( print(f" Final loss: {summary.get('avg_loss', 0):.4f}") if 'peak_memory_mb' in summary: - print(f" Peak memory usage: {summary['peak_memory_mb']:.1f} MB") + print(f" Peak device memory (high-water per step): {summary['peak_memory_mb']:.1f} MB") + if 'avg_peak_memory_mb' in summary: + print(f" Avg peak per step: {summary['avg_peak_memory_mb']:.1f} MB") # Save performance data if profiler_config.profile_dir: @@ -771,6 +798,7 @@ def 
train_tiny_llama( } profile_path = Path(profiler_config.profile_dir) / "performance_summary.json" + profile_path.parent.mkdir(parents=True, exist_ok=True) with open(profile_path, 'w') as f: json.dump(profile_data, f, indent=2) @@ -815,7 +843,7 @@ def main(): # Print banner print("=" * 80) - print("CASTILLE AI WORKSHOP - VERSION 1: PYTORCH BASELINE") + print("CASTIEL AI WORKSHOP - VERSION 1: PYTORCH BASELINE") print(" Comprehensive Profiling Foundation for Transformer Optimization") print("=" * 80) diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/README.md b/MLExamples/TinyTransformer/version2_pytorch_fused/README.md index 60e73ffe..dd3f8c25 100644 --- a/MLExamples/TinyTransformer/version2_pytorch_fused/README.md +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/README.md @@ -263,8 +263,8 @@ $$ $$ \begin{aligned} -\text{gate\_up} &= x[W_{\text{gate}} \parallel W_{\text{up}}] \quad \text{(Single GEMM)} \\ -\text{gate, up} &= \text{split}(\text{gate\_up}, \text{dim}=-1) \quad \text{(Tensor view)} \\ +\text{tmp}_\text{gate,up} &= x[W_{\text{gate}} \parallel W_{\text{up}}] \quad \text{(Single GEMM)} \\ +\text{gate, up} &= \text{split}(\text{tmp}_\text{gate,up}, \text{dim}=-1) \quad \text{(Tensor view)} \\ \text{output} &= (\text{SiLU}(\text{gate}) \odot \text{up})W_{\text{down}} \quad \text{(Fused activation + projection)} \end{aligned} $$ @@ -474,73 +474,115 @@ def calculate_arithmetic_intensity(operation_type, batch_size, seq_len, hidden_d ## Workshop Exercises +**Host–GPU affinity:** On multi-NUMA systems, it is crucial to pin the CPU cores, local memory, and GPU correctly. Poor affinity increases cross-socket traffic significantly causing misleading timings. +A quick way to pin the Python process to the first CPU and GPU is: +```bash +ROCR_VISIBLE_DEVICES=0 numactl -C 0 -m 0 +``` +See the [Affinity exercises](https://github.com/amd/HPCTrainingExamples/tree/main/Affinity) for how to discover your topology and set the affinity accordingly. 
+ + ### Exercise 1: Kernel Fusion Analysis -**Objective**: Compare baseline vs. fused implementations to quantify fusion benefits. +**Objective**: Compare the unfused, fused, and compiled configurations on the same `tiny_llama_v2.py` code path to quantify the benefits of fusion. + + +#### Step 1: Three-way throughput comparison + +From `version2_pytorch_fused/`, run the same batch size, sequence length, and step count three times. Save each run to its own `--profile-dir` so JSON summaries do not overwrite each other. -#### Step 1: Baseline Comparison ```bash -# Run Version 1 baseline for comparison -cd ../version1_pytorch_baseline -python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 30 > ../version2_baseline_comparison.log +cd version2_pytorch_fused -# Run Version 2 fused implementation -cd ../version2_pytorch_fused -python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 > fused_performance.log +# 1. Unfused baseline (equivalent to Version 1) +python tiny_llama_v2.py \ + --batch-size 8 --seq-len 128 --num-steps 30 --disable-all-fusion \ + --profile-dir ./bench_no_fusion + +# 2. Fused QKV + Flash Attention + SwiGLU +python tiny_llama_v2.py \ + --batch-size 8 --seq-len 128 --num-steps 30 \ + --profile-dir ./bench_fused + +# 3. Fused + torch.compile +python tiny_llama_v2.py \ + --batch-size 8 --seq-len 128 --num-steps 30 --enable-torch-compile \ + --profile-dir ./bench_torch_compile ``` -#### Step 2: Kernel Count Analysis -```bash -# PyTorch profiler comparison -python run_pytorch_profiler.py --batch-size 8 --profile-dir ./fusion_analysis --generate-report +Compare the performance you see for the different models. What are the differences? 
+ +#### Step 2: Optional operator-level profiling + +Compare the kernel launch patterns between the three cases with the built-in PyTorch profiler: -# Compare kernel counts between versions -python analyze_kernel_reduction.py --baseline ../version1_pytorch_baseline/pytorch_profiles --fused ./fusion_analysis +```bash +python tiny_llama_v2.py \ + --batch-size 8 --seq-len 128 --num-steps 10 --enable-pytorch-profiler \ + --profile-dir ./fusion_analysis ``` +Open the Chrome trace or TensorBoard timeline and compare the unfused and fused versions. Do you see the ~43% fewer attention-related kernels per layer reported by the Python script? -**Expected Results:** -- 40-60% reduction in kernel launch count -- 1.4-1.8x speedup in overall training -- Improved GPU utilization metrics +#### Reference results + +The following reference results have been obtained on an MI300A with PyTorch 2.9.1 and ROCm 7.2.0 with the same model setup as described above. + +| Configuration | Throughput (samples/s) | Avg batch time (ms) | Peak device memory (MB) | +|-----------------|------------------------|---------------------|-------------------------| +| `--disable-all-fusion` (V1-equivalent) | 293 | 27.3 | 998 | +| Default fused | 437 | 18.3 | 967 | +| `--enable-torch-compile` | 794 | 10.1 | 875 | + +On this setup, fusion yields ~**1.5×** throughput over the unfused path; adding `torch.compile` reaches ~**2.7×** vs. unfused and ~**1.8×** vs. fused alone. +With the short sequence length of `seq=128`, the majority of the memory is consumed by the weights and gradients leading to only minor differences in peak memory between the versions. +Continue to exercise 2 to learn more about the impact of kernel fusion and Flash Attention on the memory consumption. ### Exercise 2: Flash Attention Memory Analysis -**Objective**: Analyze memory efficiency improvements from Flash Attention. +**Objective**: Show how peak device memory scales with sequence length for naive attention vs. Flash Attention. 
+ +#### Memory scaling of unfused and fused attention + +Next, investigate how the memory consumption scales if we increase the sequence length with both naive unfused attention and the fused Flash Attention kernel. +For this, enable `--enable-memory-profiling` so the summary reports **peak device memory** per run. Keep `batch-size 4` and `num-steps 20` fixed while sweeping sequence length. +Run this for both variants and compare the scaling. Below, you can find some reference results to compare to. -#### Step 1: Memory Scaling Test ```bash -# Test memory scaling with sequence length for seq_len in 128 256 512 1024; do python tiny_llama_v2.py \ --seq-len $seq_len \ --batch-size 4 \ + --num-steps 20 \ --enable-memory-profiling \ --profile-dir ./flash_attention_seq${seq_len} done ``` -#### Step 2: Memory Bandwidth Analysis -```bash -# Analyze memory bandwidth utilization -python run_deepspeed_flops.py \ - --batch-size 8 \ - --seq-len 256 \ - --computational-intensity \ - --generate-roofline -``` +#### Reference results -**Expected Results:** +The following reference results have been obtained on an MI300A with PyTorch 2.9.1 and ROCm 7.2.0 with the same model setup as described above. -- Linear memory scaling vs. quadratic for baseline -- 2-4x memory reduction for longer sequences -- Improved arithmetic intensity metrics +| Configuration | seq=128 | seq=256 | seq=512 | seq=1024 | +|---------------|---------|---------|---------|----------| +| `--disable-all-fusion` | 764 | 1031 | 1669 | 3471 | +| Default fused (Flash Attention) | 764 | 967 | 1414 | 2302 | +| Ratio | 1.00x | 1.06x | 1.18x | 1.51x | -### Exercise 3: ROCm Tools Deep Dive +Clearly, the fused attention kernel reduces the required memory significantly. Why is that? +Unfused attention materializes an $ S \times S $ attention matrix, so the peak memory rises close to **quadratically** in sequence length once that tensor dominates. 
Flash Attention avoids storing the full matrix as it computes the local attention scores on-the-fly resulting in a roughly **linear** scaling in $ S $. At `seq=128`, both variants exhibit the same memory footprint since the majority of the occupied memory is consumed by the weights and activations. The attention matrix only becomes the dominant factor for larger sequence lengths. -**Objective**: Master ROCm profiling tools for hardware-level optimization. +Does the further fusion with `torch.compile` lower the peak even more? Try it out! + +### Exercise 3: Using ROCm Tools + +**Objective**: Explore ROCm profiling tools for hardware-level optimization. AMD offers three performance profiling tools for ROCm based applications: -`rocprofv3`, `rocprof-sys`, and `rocprof-compute`. For more details about these tools, see + - `rocprofv3` (hotspot analysis and timeline traces) + - `rocprof-sys` (hotspot and timeline profiling including CPU and MPI) + - `rocprof-compute` (in-depth profiling of kernels) + +For more details about these tools, see [Appendix C of the TECHNICAL_APPENDICES.md](https://github.com/amd/HPCTrainingExamples/blob/main/MLExamples/TinyTransformer/TECHNICAL_APPENDICES.md#appendix-c-rocm-profiling-tools-reference). about each tool. @@ -549,11 +591,14 @@ about each tool. Running rocprofv3 to collect GPU hotspots on this example would look like this: ```bash -rocprofv3 --kernel-trace --stats --truncate-kernels -- python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 +rocprofv3 --kernel-trace -S --stats --truncate-kernels --output-format csv -- \ + python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 30 ``` View the `_kernel_stats.csv` file to see the GPU kernel hotspots. +Note: Since the statistics are computed per kernel name, the `--truncate-kernels` argument might merge kernels with similar signatures into the same truncated name. 
+ #### Step 2: rocprof-sys System Analysis To collect a comprehensive timeline trace with host and device activity, run rocprof-sys as shown below: @@ -574,7 +619,7 @@ rocprof-compute profile -n roof --kernel-names --roof-only --device 0 -- python This generates three PDF files: two roofline plots and a legend. -To collect a profile, then analyze a particular dispatch, run the following commands: +To collect a profile, then analyze a particular kernel dispatch, run the following commands: ```bash rocprof-compute profile -n ver2 --no-roof -- python3 tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30 diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/launch_performance_study.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/launch_performance_study.sh index 7c5f4355..1f9c81a9 100755 --- a/MLExamples/TinyTransformer/version2_pytorch_fused/launch_performance_study.sh +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/launch_performance_study.sh @@ -98,7 +98,7 @@ mkdir -p "$OUTPUT_DIR" # Display configuration echo "================================================================================" -echo "CASTILLE AI WORKSHOP - VERSION 2 PYTORCH FUSED PERFORMANCE STUDY" +echo "CASTIEL AI WORKSHOP - VERSION 2 PYTORCH FUSED PERFORMANCE STUDY" echo " Kernel Fusion Optimizations (QKV, Flash Attention, SwiGLU, Torch Compile)" echo "================================================================================" echo "" diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/run_all_profilers.sh b/MLExamples/TinyTransformer/version2_pytorch_fused/run_all_profilers.sh index 314fc39c..0a904859 100755 --- a/MLExamples/TinyTransformer/version2_pytorch_fused/run_all_profilers.sh +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/run_all_profilers.sh @@ -191,7 +191,7 @@ done # Print banner echo "=" * 80 -echo "CASTILLE AI WORKSHOP - COMPREHENSIVE PROFILING SUITE V2" +echo "CASTIEL AI WORKSHOP - COMPREHENSIVE PROFILING SUITE V2" echo " 
Fusion Optimization Analysis with Complete ROCm Tools Integration" echo "=" * 80 echo "" diff --git a/MLExamples/TinyTransformer/version2_pytorch_fused/tiny_llama_v2.py b/MLExamples/TinyTransformer/version2_pytorch_fused/tiny_llama_v2.py index 716e8225..e41eafdf 100644 --- a/MLExamples/TinyTransformer/version2_pytorch_fused/tiny_llama_v2.py +++ b/MLExamples/TinyTransformer/version2_pytorch_fused/tiny_llama_v2.py @@ -141,6 +141,7 @@ def reset(self): self.metrics = { 'training_speed': [], 'memory_usage': [], + 'gpu_peak_memory_mb': [], 'gpu_utilization': [], 'loss_values': [], 'batch_times': [], @@ -168,8 +169,14 @@ def end_timing(self) -> float: self.start_time = None return elapsed - def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float], fusion_stats: Dict[str, Any] = None): - """Record metrics for a training batch with fusion statistics.""" + def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float], fusion_stats: Dict[str, Any] = None, + gpu_peak_memory_mb: Optional[float] = None): + """Record metrics for a training batch with fusion statistics. + + gpu_peak_memory_mb: per-step peak device memory (bytes->MB) from + torch.cuda.max_memory_allocated() after reset_peak_memory_stats() at + step start; captures transient activations during backward. 
+ """ self.total_samples += batch_size self.metrics['loss_values'].append(loss) self.metrics['batch_times'].append(timings.get('total', 0)) @@ -181,6 +188,8 @@ def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, if torch.cuda.is_available(): memory_mb = torch.cuda.memory_allocated() / (1024**2) self.metrics['memory_usage'].append(memory_mb) + if gpu_peak_memory_mb is not None: + self.metrics['gpu_peak_memory_mb'].append(gpu_peak_memory_mb) # Training speed if timings.get('total', 0) > 0: @@ -206,7 +215,13 @@ def get_summary(self) -> Dict[str, Any]: 'avg_optimizer_time': np.mean(self.metrics['optimizer_times']), } - if self.metrics['memory_usage']: + if self.metrics['gpu_peak_memory_mb']: + summary.update({ + 'peak_memory_mb': max(self.metrics['gpu_peak_memory_mb']), + 'avg_peak_memory_mb': np.mean(self.metrics['gpu_peak_memory_mb']), + 'avg_memory_mb': np.mean(self.metrics['memory_usage']) if self.metrics['memory_usage'] else 0.0, + }) + elif self.metrics['memory_usage']: summary.update({ 'peak_memory_mb': max(self.metrics['memory_usage']), 'avg_memory_mb': np.mean(self.metrics['memory_usage']) @@ -223,10 +238,16 @@ def get_summary(self) -> Dict[str, Any]: fusion_summary = {} for key, values in total_fusion_stats.items(): - if isinstance(values[0], (int, float)): - fusion_summary[f'avg_{key}'] = np.mean(values) + sample = values[0] + # bool subclasses int — must branch on bool first so flags keep canonical keys + if isinstance(sample, bool): + fusion_summary[key] = bool(sample) + elif isinstance(sample, int): + fusion_summary[key] = int(round(np.mean(values))) + elif isinstance(sample, float): + fusion_summary[key] = float(np.mean(values)) else: - fusion_summary[key] = values[-1] # Keep latest non-numeric value + fusion_summary[key] = values[-1] summary['fusion_statistics'] = fusion_summary @@ -818,6 +839,9 @@ def train_tiny_llama_v2( print("=" * 70) for step in range(num_steps): + if torch.cuda.is_available(): + 
torch.cuda.reset_peak_memory_stats() + # Start batch timing batch_timings = {} monitor.start_timing() @@ -863,12 +887,18 @@ def train_tiny_llama_v2( # Total batch time batch_timings['total'] = sum(batch_timings.values()) + peak_mb: Optional[float] = None + if torch.cuda.is_available(): + torch.cuda.synchronize() + peak_mb = torch.cuda.max_memory_allocated() / (1024**2) + # Record metrics with fusion statistics monitor.record_batch_metrics( batch_size, loss.item(), batch_timings, - fusion_stats + fusion_stats, + gpu_peak_memory_mb=peak_mb, ) # PyTorch profiler step @@ -878,12 +908,13 @@ def train_tiny_llama_v2( # Progress logging if step % 10 == 0: speed = batch_size / batch_timings['total'] if batch_timings['total'] > 0 else 0 - memory_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0 + live_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0 + peak_log = f"{peak_mb:6.1f}" if peak_mb is not None else " n/a" print(f"Step {step:3d}/{num_steps} | " f"Loss: {loss.item():.4f} | " f"Speed: {speed:5.1f} samples/sec | " - f"Memory: {memory_mb:6.1f} MB | " + f"Peak: {peak_log} MB | Live: {live_mb:6.1f} MB | " f"Time: {batch_timings['total']*1000:5.1f}ms") print("=" * 70) @@ -919,7 +950,11 @@ def train_tiny_llama_v2( print(f" Final loss: {summary.get('avg_loss', 0):.4f}") if 'peak_memory_mb' in summary: - print(f" Peak memory usage: {summary['peak_memory_mb']:.1f} MB") + print(f" Peak device memory (high-water per step): {summary['peak_memory_mb']:.1f} MB") + if 'avg_peak_memory_mb' in summary: + print(f" Avg peak per step: {summary['avg_peak_memory_mb']:.1f} MB") + if 'avg_memory_mb' in summary: + print(f" Avg live allocations after step: {summary['avg_memory_mb']:.1f} MB") # Fusion efficiency summary if 'fusion_statistics' in summary: @@ -1017,7 +1052,7 @@ def main(): # Print banner print("=" * 80) - print("CASTILLE AI WORKSHOP - VERSION 2: PYTORCH FUSED") + print("CASTIEL AI WORKSHOP - VERSION 2: PYTORCH 
FUSED") print(" Kernel Fusion Optimizations with ROCm Tools Integration") print("=" * 80) @@ -1136,4 +1171,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/MLExamples/TinyTransformer/version3_triton/README.md b/MLExamples/TinyTransformer/version3_triton/README.md index 24d5e8b2..c8f5794a 100644 --- a/MLExamples/TinyTransformer/version3_triton/README.md +++ b/MLExamples/TinyTransformer/version3_triton/README.md @@ -474,7 +474,7 @@ MEMORY_ACCESS_PATTERNS = { Ensure Triton is installed in your environment: ```bash -# Should already be installed from setup/ +# Should already be installed with your PyTorch / ROCm modules pip install triton ``` diff --git a/MLExamples/TinyTransformer/version3_triton/launch_performance_study.sh b/MLExamples/TinyTransformer/version3_triton/launch_performance_study.sh index d9d0d216..912c62e6 100755 --- a/MLExamples/TinyTransformer/version3_triton/launch_performance_study.sh +++ b/MLExamples/TinyTransformer/version3_triton/launch_performance_study.sh @@ -86,7 +86,7 @@ mkdir -p "$OUTPUT_DIR" # Display configuration echo "================================================================================" -echo "CASTILLE AI WORKSHOP - VERSION 3 TRITON CUSTOM KERNELS PERFORMANCE STUDY" +echo "CASTIEL AI WORKSHOP - VERSION 3 TRITON CUSTOM KERNELS PERFORMANCE STUDY" echo " Hand-Optimized GPU Kernels (RMSNorm, SwiGLU, Flash Attention)" echo "================================================================================" echo "" diff --git a/MLExamples/TinyTransformer/version3_triton/tiny_llama_v3.py b/MLExamples/TinyTransformer/version3_triton/tiny_llama_v3.py index 3c8a64cf..cf0e35e3 100644 --- a/MLExamples/TinyTransformer/version3_triton/tiny_llama_v3.py +++ b/MLExamples/TinyTransformer/version3_triton/tiny_llama_v3.py @@ -515,7 +515,7 @@ def train_triton_model( Training mode for Triton-optimized model with comprehensive metrics. 
""" print("=" * 80) - print("CASTILLE AI WORKSHOP - VERSION 3: TRITON CUSTOM KERNELS") + print("CASTIEL AI WORKSHOP - VERSION 3: TRITON CUSTOM KERNELS") print(" Custom GPU Kernels for Maximum Performance") print("=" * 80) diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md b/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md index 441f52d1..7b90b307 100644 --- a/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/README.md @@ -67,8 +67,8 @@ $$\begin{aligned} $$\begin{aligned} \text{Input:} \quad & x \in \mathbb{R}^{B \times S \times D} \\ -\text{Attention Block:} \quad & \text{attn\_out} = x + \text{Attention}(\text{RMSNorm}(x)) \\ -\text{FFN Block:} \quad & \text{output} = \text{attn\_out} + \text{SwiGLU}(\text{RMSNorm}(\text{attn\_out})) \\ +\text{Attention Block:} \quad & \text{attn}_\text{out} = x + \text{Attention}(\text{RMSNorm}(x)) \\ +\text{FFN Block:} \quad & \text{output} = \text{attn}_\text{out} + \text{SwiGLU}(\text{RMSNorm}(\text{attn}_\text{out})) \\ \text{All in One Kernel!} \quad & \text{Eliminates } 11 \text{ intermediate memory operations} \end{aligned}$$ diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/launch_performance_study.sh b/MLExamples/TinyTransformer/version4_pytorch_sdpa/launch_performance_study.sh index 714bddce..c06e8ac2 100755 --- a/MLExamples/TinyTransformer/version4_pytorch_sdpa/launch_performance_study.sh +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/launch_performance_study.sh @@ -86,7 +86,7 @@ mkdir -p "$OUTPUT_DIR" # Display configuration echo "================================================================================" -echo "CASTILLE AI WORKSHOP - VERSION 4 PYTORCH SDPA PERFORMANCE STUDY" +echo "CASTIEL AI WORKSHOP - VERSION 4 PYTORCH SDPA PERFORMANCE STUDY" echo " Library-Optimized Approach: PyTorch SDPA (Same Performance as V3)" echo 
"================================================================================" echo "" diff --git a/MLExamples/TinyTransformer/version4_pytorch_sdpa/tiny_llama_v4.py b/MLExamples/TinyTransformer/version4_pytorch_sdpa/tiny_llama_v4.py index 4fd5bb53..74465dc9 100644 --- a/MLExamples/TinyTransformer/version4_pytorch_sdpa/tiny_llama_v4.py +++ b/MLExamples/TinyTransformer/version4_pytorch_sdpa/tiny_llama_v4.py @@ -759,7 +759,7 @@ def train_ultra_fused_model( Training mode for ultra-fused model with comprehensive version comparison. """ print("=" * 80) - print("CASTILLE AI WORKSHOP - VERSION 4: ULTRA-FUSED TRITON") + print("CASTIEL AI WORKSHOP - VERSION 4: ULTRA-FUSED TRITON") print(" Maximum Performance Through Aggressive Kernel Fusion") print("=" * 80)