diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
new file mode 100644
index 0000000..fe596a2
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Diagnose Prompt Module for Hardware Bottleneck Analysis.
+
+"""
+
+from .gpu_specs import get_gpu_specs
+
+
+__all__ = ["get_gpu_specs"]
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py
new file mode 100644
index 0000000..3e15846
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py
@@ -0,0 +1,95 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU Specifications Lookup for Bottleneck Analysis
+
+This module provides GPU hardware specifications needed for performance analysis
+and bottleneck identification. It includes peak compute performance, memory bandwidth,
+cache sizes, and SM counts for common NVIDIA GPUs.
+
+"""
+
+import logging
+from typing import Any
+
+from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
+    GPU_SPECS_DATABASE,
+)
+
+__all__ = ["GPU_SPECS_DATABASE", "get_gpu_specs"]
+
+logger = logging.getLogger(__name__)
+
+
+def get_gpu_specs(gpu_name: str) -> dict[str, Any] | None:
+    """
+    Get GPU specifications for bottleneck analysis.
+
+    This function returns hardware specifications needed for performance analysis,
+    including peak compute performance, memory bandwidth, cache sizes, and SM counts.
+
+    Args:
+        gpu_name: GPU name. Must exactly match a key in GPU_SPECS_DATABASE.
+
+    Returns:
+        Dictionary with GPU specifications, or None if the GPU is not in the database.
+        When successful, contains:
+        - name: GPU name
+        - architecture: GPU architecture (e.g., "Ampere", "Hopper")
+        - peak_fp32_tflops: Peak FP32 compute performance in TFLOPS
+        - peak_fp16_tflops: Peak FP16 compute performance in TFLOPS
+        - peak_bf16_tflops: Peak BF16 compute performance in TFLOPS (0 if not supported)
+        - peak_memory_bw_gbps: Peak memory bandwidth in GB/s
+        - sm_count: Number of streaming multiprocessors
+        - max_threads_per_sm: Maximum threads per SM
+        - l1_cache_kb: L1 cache size in KB per SM
+        - l2_cache_mb: Total L2 cache size in MB
+        - memory_gb: Total GPU memory in GB
+        - memory_type: Memory type (e.g., "HBM2e", "GDDR6X")
+
+    Examples:
+        >>> specs = get_gpu_specs("NVIDIA A100 SXM4 80GB")
+        >>> if specs:
+        ...     print(f"SM Count: {specs['sm_count']}")
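+
+        Unknown names return None rather than raising (illustrative; "Imaginary
+        GPU" is a hypothetical name, not a database entry):
+
+        >>> get_gpu_specs("Imaginary GPU") is None
+        True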
+    """
+    if gpu_name in GPU_SPECS_DATABASE:
+        return GPU_SPECS_DATABASE[gpu_name].copy()
+
+    logger.warning(
+        "Unknown GPU: '%s'. Optimization will be disabled. Available GPUs: %s",
+        gpu_name,
+        ", ".join(GPU_SPECS_DATABASE.keys()),
+    )
+    return None
+
+
+if __name__ == "__main__":
+    print("GPU Specifications Module")
+    print("=" * 60)
+
+    # Show all available GPUs
+    print("Available GPU specifications in database:")
+    for gpu_name in sorted(GPU_SPECS_DATABASE.keys()):
+        print(f"  - {gpu_name}")
+
+    # Example usage
+    print(f"\n{'=' * 60}")
+    example_gpu = "NVIDIA A100 SXM4 80GB"
+    specs = get_gpu_specs(example_gpu)
+    if specs:
+        print(f"\nExample specs for {example_gpu}:")
+        print(f"  - Peak Memory Bandwidth: {specs['peak_memory_bw_gbps']} GB/s")
+        print(f"  - Peak FP32 Performance: {specs['peak_fp32_tflops']} TFLOPS")
+        print(f"  - SM Count: {specs['sm_count']}")
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
new file mode 100644
index 0000000..cbc616d
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU Specifications Database
+
+This module contains the GPU hardware specifications database used for
+performance analysis and bottleneck identification. It includes specific
+SKU variants for multi-SKU GPUs such as the A100 and H100.
+
+Sources:
+- NVIDIA official specifications and datasheets
+- TechPowerUp GPU Database
+
+Last Updated: January 2026
+"""
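+
+# Example (illustrative, not used by this module): with the A100 SXM4 80GB
+# entry below, a dense FP32 kernel is roughly memory-bound once its
+# arithmetic intensity drops below 19.5e12 / 2039e9 ≈ 9.6 FLOP/byte.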
+
+GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
+    # NVIDIA A100 SKUs - SXM4 Variants
+    "NVIDIA A100 SXM4 40GB": {
+        "name": "NVIDIA A100 SXM4 40GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1555,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 40,
+        "memory_type": "HBM2e",
+        "form_factor": "SXM4",
+        "tdp_w": 400,
+    },
+    "NVIDIA A100 SXM4 80GB": {
+        "name": "NVIDIA A100 SXM4 80GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 2039,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "SXM4",
+        "tdp_w": 400,
+    },
+    # NVIDIA A100 SKUs - PCIe Variants
+    "NVIDIA A100 PCIe 40GB": {
+        "name": "NVIDIA A100 PCIe 40GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1555,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 40,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 250,
+    },
+    "NVIDIA A100 PCIe 80GB": {
+        "name": "NVIDIA A100 PCIe 80GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1935,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 300,
+    },
+    # NVIDIA H100 SKUs - SXM5 Variant
+    "NVIDIA H100 SXM5 80GB": {
+        "name": "NVIDIA H100 SXM5 80GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 67.0,
+        "peak_fp16_tflops": 1979.0,  # With sparsity
+        "peak_bf16_tflops": 1979.0,  # With sparsity
+        "peak_memory_bw_gbps": 3350,
+        "sm_count": 132,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 80,
+        "memory_type": "HBM3",
+        "form_factor": "SXM5",
+        "tdp_w": 700,
+    },
+    # NVIDIA H100 SKUs - PCIe Variant
+    "NVIDIA H100 PCIe 80GB": {
+        "name": "NVIDIA H100 PCIe 80GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 51.0,
+        "peak_fp16_tflops": 1513.0,  # With sparsity
+        "peak_bf16_tflops": 1513.0,  # With sparsity
+        "peak_memory_bw_gbps": 2000,
+        "sm_count": 114,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 350,
+    },
+    # NVIDIA H100 SKUs - NVL Variant (for LLM inference)
+    "NVIDIA H100 NVL 94GB": {
+        "name": "NVIDIA H100 NVL 94GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 60.0,
+        "peak_fp16_tflops": 1671.0,  # With sparsity
+        "peak_bf16_tflops": 1671.0,  # With sparsity
+        "peak_memory_bw_gbps": 3900,
+        "sm_count": 132,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 94,
+        "memory_type": "HBM3",
+        "form_factor": "PCIe",
+        "tdp_w": 400,
+    },
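+    # Note: the datacenter entries above quote Tensor Core FP16/BF16 rates,
+    # while the consumer entries below quote shader (non-tensor) rates.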
+    # NVIDIA RTX 4090
+    "NVIDIA RTX 4090": {
+        "name": "NVIDIA RTX 4090",
+        "architecture": "Ada Lovelace",
+        "peak_fp32_tflops": 82.58,
+        "peak_fp16_tflops": 82.58,
+        "peak_bf16_tflops": 82.58,
+        "peak_memory_bw_gbps": 1008,
+        "sm_count": 128,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 72,
+        "memory_gb": 24,
+        "memory_type": "GDDR6X",
+        "form_factor": "PCIe",
+        "tdp_w": 450,
+    },
+    # NVIDIA RTX 5080
+    "NVIDIA RTX 5080": {
+        "name": "NVIDIA RTX 5080",
+        "architecture": "Blackwell",
+        "peak_fp32_tflops": 56.28,
+        "peak_fp16_tflops": 56.28,
+        "peak_bf16_tflops": 56.28,
+        "peak_memory_bw_gbps": 960,
+        "sm_count": 84,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 64,
+        "memory_gb": 16,
+        "memory_type": "GDDR7",
+        "form_factor": "PCIe",
+        "tdp_w": 360,
+    },
+}
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py
new file mode 100644
index 0000000..64d1d67
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py
@@ -0,0 +1,151 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Metric Schema Definitions for NCU Profiling and GPU Specifications.
+
+This module provides the single source of truth for:
+- NCU profiling metric definitions (keys, labels, units)
+- GPU specification field definitions
+
+Schema Format: List of tuples (display_label, key, unit_suffix)
+- display_label: Human-readable name shown in prompts
+- key: NCU metric key or GPU spec dictionary key
+- unit_suffix: Unit to append after value (e.g., "%", " GB/s", " bytes")
+"""
+
+# Type alias for metric definition: (label, key, unit)
+MetricDef = tuple[str, str, str]
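+
+# Example (illustrative): the MetricDef ("SM Count", "sm_count", "") is
+# rendered as f"{label}: {value}{unit}", e.g. "SM Count: 108" on an A100.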
+
+# =============================================================================
+# GPU Specification Fields
+# =============================================================================
+
+GPU_SPEC_FIELDS: list[MetricDef] = [
+    ("Name", "name", ""),
+    ("Architecture", "architecture", ""),
+    ("Peak Memory Bandwidth", "peak_memory_bw_gbps", " GB/s"),
+    ("Peak FP32 Performance", "peak_fp32_tflops", " TFLOPS"),
+    ("Peak FP16 Performance", "peak_fp16_tflops", " TFLOPS"),
+    ("SM Count", "sm_count", ""),
+    ("Max Threads per SM", "max_threads_per_sm", ""),
+    ("L1 Cache per SM", "l1_cache_kb", " KB"),
+    ("L2 Cache (Total)", "l2_cache_mb", " MB"),
+]
+
+# Special case: Memory Size has two fields combined
+GPU_MEMORY_FIELDS: list[tuple[str, str, str, str]] = [
+    # (label, size_key, type_key, size_unit)
+    ("Memory Size", "memory_gb", "memory_type", " GB"),
+]
+
+# =============================================================================
+# NCU Profiling Metric Sections
+# =============================================================================
+
+NCU_METRIC_SECTIONS: dict[str, list[MetricDef]] = {
+    "SM & Compute Utilization": [
+        ("SM Cycles Active", "sm__cycles_active.avg", ""),
+        ("Warp Active", "sm__warps_active.avg.pct_of_peak_sustained_active", "%"),
+        ("Total Instructions Executed", "sm__inst_executed.sum", ""),
+        (
+            "Tensor Core Utilization",
+            "sm__inst_executed_pipe_tensor.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+        (
+            "Tensor Core Pipeline Active",
+            "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+    ],
+    "Memory Bandwidth & Cache": [
+        (
+            "DRAM Throughput",
+            "dram__throughput.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+        ("DRAM Bandwidth", "dram__bytes.sum.per_second", " bytes/sec"),
+        (
+            "GPU DRAM Throughput",
+            "gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+        ("DRAM Bytes Read", "dram__bytes_read.sum", " bytes"),
+        ("DRAM Bytes Write", "dram__bytes_write.sum", " bytes"),
+        ("L1 Cache Hit Rate", "l1tex__t_sector_hit_rate.pct", "%"),
+        (
+            "L1 Throughput",
+            "l1tex__throughput.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+        ("L2 Cache Hit Rate", "lts__t_sector_hit_rate.pct", "%"),
+        (
+            "L2 Throughput",
+            "lts__throughput.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+    ],
+    "Memory Access Patterns": [
+        (
+            "Memory Coalescing",
+            "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
+            "%",
+        ),
+        (
+            "Branch Uniformity",
+            "smsp__sass_average_branch_targets_threads_uniform.pct",
+            "%",
+        ),
+    ],
+    "Occupancy & Resources": [
+        ("Occupancy Limited By Blocks", "launch__occupancy_limit_blocks", ""),
+        ("Occupancy Limited By Registers", "launch__occupancy_limit_registers", ""),
+        (
+            "Occupancy Limited By Shared Memory",
+            "launch__occupancy_limit_shared_mem",
+            "",
+        ),
+        ("Registers per Thread", "launch__registers_per_thread", ""),
+        (
+            "Shared Memory per Block",
+            "launch__shared_mem_per_block_allocated",
+            " bytes",
+        ),
+    ],
+    "Stall Metrics (Warp Issue Stalls)": [
+        (
+            "Short Scoreboard Stalls",
+            "smsp__warp_issue_stalled_short_scoreboard_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Long Scoreboard Stalls",
+            "smsp__warp_issue_stalled_long_scoreboard_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Barrier Stalls",
+            "smsp__warp_issue_stalled_barrier_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Branch Resolving Stalls",
+            "smsp__warp_issue_stalled_branch_resolving_per_warp_active.pct",
+            "%",
+        ),
+    ],
+}
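+
+# Illustrative consumption sketch (assumed usage, not defined in this module):
+#   for section, metric_defs in NCU_METRIC_SECTIONS.items():
+#       for label, key, unit in metric_defs:
+#           value = ncu_row.get(key)  # `ncu_row`: a hypothetical parsed NCU CSV row
+#           print(f"{label}: {value}{unit}")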
diff --git a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
index 4ce8568..4b1bf83 100644
--- a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
+++ b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
@@ -255,12 +255,8 @@ def _apply_selection_policy(
     Returns:
         DataFrame with a single row based on the policy
     """
-    if df.empty:
+    if len(df) <= 1:
         return df
-
-    if len(df) == 1:
-        return df
-
     if policy == MetricSelectionPolicy.FIRST:
         return df.iloc[[0]]
     elif policy == MetricSelectionPolicy.LAST:
@@ -317,7 +313,7 @@ def load_ncu_metrics(
     extra_keep: Optional[Sequence[str]] = ("Kernel Name",),
     coerce_numeric: bool = True,
     name_list: Optional[Sequence[str]] = None,
-    select: Union[str, MetricSelectionPolicy] = MetricSelectionPolicy.LAST,
+    select: MetricSelectionPolicy = MetricSelectionPolicy.LAST,
 ) -> pd.DataFrame:
     """
     Load and parse NCU metrics from CSV file.
@@ -328,32 +324,19 @@
         extra_keep: Additional columns to keep (e.g., "Kernel Name")
         coerce_numeric: Convert metric values to numeric
         name_list: Filter by kernel names (substring match)
-        select: Selection policy when multiple rows per name.
-            Can be MetricSelectionPolicy enum or string ("first", "last", "max_cycles")
+        select: Selection policy when multiple rows per name
 
     Returns:
         DataFrame with parsed metrics
 
     Raises:
         FileNotFoundError: If CSV file not found
-        ValueError: If no requested columns found in CSV or invalid select value
+        ValueError: If no requested columns found in CSV
     """
     csv_path = Path(csv_path)
     if not csv_path.exists():
         raise FileNotFoundError(f"CSV not found: {csv_path}")
 
-    # Convert string to enum if needed
-    if isinstance(select, str):
-        try:
-            policy = MetricSelectionPolicy(select)
-        except ValueError:
-            raise ValueError(
-                f"Invalid select value: {select}. "
-                f"Must be one of: {[p.value for p in MetricSelectionPolicy]}"
-            )
-    else:
-        policy = select
-
     df = pd.read_csv(csv_path, comment="=", low_memory=False)
 
     metric_cols = list(columns) if columns is not None else METRIC_COLUMNS
@@ -383,14 +366,11 @@
             .apply(pd.to_numeric, errors="coerce")
         )
 
-    # Filter by kernel name list if provided
-    if name_list:
-        sub = _filter_by_kernel_names(sub, name_list, policy, keep_cols)
-    else:
-        # Apply selection to all rows if no name filter
-        sub = _apply_selection_policy(sub, policy)
-
-    return sub
+    return (
+        _filter_by_kernel_names(sub, name_list, select, keep_cols)
+        if name_list
+        else _apply_selection_policy(sub, select)
+    )
 
 
 def metrics_to_prompt(
diff --git a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
index 733216c..fe1e84f 100644
--- a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
+++ b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
@@ -288,12 +288,11 @@ def main():
     args = _parse_args()
 
     device = torch.device(args.device)
-    dtype_map = {
+    dtype = {
         "float32": torch.float32,
         "float16": torch.float16,
         "bfloat16": torch.bfloat16,
-    }
-    dtype = dtype_map[args.dtype]
+    }[args.dtype]
 
     if not args.quiet:
         print("=" * 80)