-
Notifications
You must be signed in to change notification settings - Fork 29
[Optimization 3/n] Add GPU Spec Module #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
07a3268
3c4b124
11f4e79
251f419
b789660
4d35d57
d871678
db0c754
cd29759
bbfa6cd
543453a
706c9cc
4febdd6
d92a7b7
2994315
1378fc3
45fec80
b640cde
e952123
e7ba29a
72ac4d1
e2c599e
8ab907c
e350802
8541299
313a84f
9e608ac
f3220e1
4443f33
b12b138
f34ddc8
d47113c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| Diagnose Prompt Module for Hardware Bottleneck Analysis. | ||
| """ | ||
|
|
||
| from .gpu_specs import get_gpu_specs | ||
|
|
||
|
|
||
| __all__ = ["get_gpu_specs"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| GPU Specifications Database for Bottleneck Analysis | ||
|
|
||
| This module provides GPU hardware specifications needed for performance analysis | ||
| and bottleneck identification. It includes peak compute performance, memory bandwidth, | ||
| cache sizes, and SM counts for common NVIDIA GPUs. | ||
|
|
||
| """ | ||
|
|
||
| import logging | ||
| from typing import Any | ||
|
|
||
| from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import ( | ||
| GPU_SPECS_DATABASE, | ||
| ) | ||
|
|
||
| __all__ = ["GPU_SPECS_DATABASE", "get_gpu_specs"] | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def get_gpu_specs(gpu_name: str) -> dict[str, Any] | None:
    """
    Look up GPU hardware specifications for bottleneck analysis.

    Provides the figures needed for performance analysis: peak compute
    performance, memory bandwidth, cache sizes, and SM counts.

    Args:
        gpu_name: GPU name. Must exactly match a key in GPU_SPECS_DATABASE.

    Returns:
        Dictionary with GPU specifications, or None if GPU is not in the database.
        When successful, contains:
            - name: GPU name
            - architecture: GPU architecture (e.g., "Ampere", "Hopper")
            - peak_fp32_tflops: Peak FP32 compute performance in TFLOPS
            - peak_fp16_tflops: Peak FP16 compute performance in TFLOPS
            - peak_bf16_tflops: Peak BF16 compute performance in TFLOPS (0 if not supported)
            - peak_memory_bw_gbps: Peak memory bandwidth in GB/s
            - sm_count: Number of streaming multiprocessors
            - max_threads_per_sm: Maximum threads per SM
            - l1_cache_kb: L1 cache size in KB per SM
            - l2_cache_mb: Total L2 cache size in MB
            - memory_gb: Total GPU memory in GB
            - memory_type: Memory type (e.g., "HBM2e", "GDDR6X")

    Examples:
        >>> specs = get_gpu_specs("NVIDIA A100 SXM4 80GB")
        >>> if specs:
        ...     print(f"SM Count: {specs['sm_count']}")
    """
    # Single dict lookup instead of a membership test followed by indexing.
    entry = GPU_SPECS_DATABASE.get(gpu_name)
    if entry is not None:
        # Return a shallow copy so callers cannot mutate the shared database
        # entry (values are flat scalars/strings, so shallow is sufficient).
        return entry.copy()

    logger.warning(
        "Unknown GPU: '%s'. Disable Optimization. Available GPUs: %s",
        gpu_name,
        ", ".join(GPU_SPECS_DATABASE.keys()),
    )
    return None
|
|
||
|
|
||
if __name__ == "__main__":
    # Simple self-demo: list the known GPUs, then show one example lookup.
    separator = "=" * 60
    print("GPU Specifications Module")
    print(separator)

    # Show all available GPUs
    print("Available GPU specifications in database:")
    for known_gpu in sorted(GPU_SPECS_DATABASE):
        print(f" - {known_gpu}")

    # Example usage
    print(f"\n{separator}")
    example_gpu = "NVIDIA A100 SXM4 80GB"
    example_specs = get_gpu_specs(example_gpu)
    if example_specs:
        print(f"\nExample specs for {example_gpu}:")
        print(f" - Peak Memory Bandwidth: {example_specs['peak_memory_bw_gbps']} GB/s")
        print(f" - Peak FP32 Performance: {example_specs['peak_fp32_tflops']} TFLOPS")
        print(f" - SM Count: {example_specs['sm_count']}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,182 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| GPU Specifications Database - Updated with Specific SKUs | ||
|
|
||
| This module contains the GPU hardware specifications database used for | ||
| performance analysis and bottleneck identification. Updated to include | ||
| specific SKU variants for multi-SKU GPUs like A100 and H100. | ||
|
|
||
| Sources: | ||
| - NVIDIA official specifications and datasheets | ||
| - TechPowerUp GPU Database | ||
| - Manufacturer datasheets | ||
|
|
||
| Last Updated: January 2026 | ||
| """ | ||
|
|
||
# Each entry maps an exact GPU product name to its hardware specification.
# Units: TFLOPS for compute peaks, GB/s for memory bandwidth, KB/MB/GB for
# caches and memory, watts for TDP. FP16/BF16 peaks are quoted without
# sparsity, per the sources listed in the module docstring.
GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
    # NVIDIA A100 SKUs - SXM4 Variants
    "NVIDIA A100 SXM4 40GB": {
        "name": "NVIDIA A100 SXM4 40GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
        "form_factor": "SXM4",
        "tdp_w": 400,
    },
    "NVIDIA A100 SXM4 80GB": {
        "name": "NVIDIA A100 SXM4 80GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 2039,  # 80GB SXM4 has faster HBM2e than 40GB (1555)
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "SXM4",
        "tdp_w": 400,
    },
    # NVIDIA A100 SKUs - PCIe Variants
    "NVIDIA A100 PCIe 40GB": {
        "name": "NVIDIA A100 PCIe 40GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 250,
    },
    "NVIDIA A100 PCIe 80GB": {
        "name": "NVIDIA A100 PCIe 80GB",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,  # Without sparsity
        "peak_bf16_tflops": 312.0,  # Without sparsity
        "peak_memory_bw_gbps": 1935,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 300,
    },
    # NVIDIA H100 SKUs - SXM5 Variant
    "NVIDIA H100 SXM5 80GB": {
        "name": "NVIDIA H100 SXM5 80GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 67.0,
        "peak_fp16_tflops": 1979.0,  # Without sparsity
        "peak_bf16_tflops": 1979.0,  # Without sparsity
        "peak_memory_bw_gbps": 3350,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM3",
        "form_factor": "SXM5",
        "tdp_w": 700,
    },
    # NVIDIA H100 SKUs - PCIe Variant
    # NOTE: PCIe H100 has fewer SMs and lower clocks than SXM5, hence the
    # lower compute/bandwidth peaks.
    "NVIDIA H100 PCIe 80GB": {
        "name": "NVIDIA H100 PCIe 80GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 51.0,
        "peak_fp16_tflops": 1513.0,  # Without sparsity
        "peak_bf16_tflops": 1513.0,  # Without sparsity
        "peak_memory_bw_gbps": 2000,
        "sm_count": 114,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM2e",
        "form_factor": "PCIe",
        "tdp_w": 350,
    },
    # NVIDIA H100 SKUs - NVL Variant (for LLM inference)
    "NVIDIA H100 NVL 94GB": {
        "name": "NVIDIA H100 NVL 94GB",
        "architecture": "Hopper",
        "peak_fp32_tflops": 60.0,
        "peak_fp16_tflops": 1671.0,  # Without sparsity
        "peak_bf16_tflops": 1671.0,  # Without sparsity
        "peak_memory_bw_gbps": 3900,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 94,
        "memory_type": "HBM3",
        "form_factor": "PCIe",
        "tdp_w": 400,
    },
    # NVIDIA RTX 4090
    # NOTE: consumer parts list the same peak for fp32/fp16/bf16 here
    # (non-tensor-core shader throughput) — verify against datasheet if
    # tensor-core peaks are intended.
    "NVIDIA RTX 4090": {
        "name": "NVIDIA RTX 4090",
        "architecture": "Ada Lovelace",
        "peak_fp32_tflops": 82.58,
        "peak_fp16_tflops": 82.58,
        "peak_bf16_tflops": 82.58,
        "peak_memory_bw_gbps": 1008,
        "sm_count": 128,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 72,
        "memory_gb": 24,
        "memory_type": "GDDR6X",
        "form_factor": "PCIe",
        "tdp_w": 450,
    },
    # NVIDIA RTX 5080
    "NVIDIA RTX 5080": {
        "name": "NVIDIA RTX 5080",
        "architecture": "Blackwell",
        "peak_fp32_tflops": 56.28,
        "peak_fp16_tflops": 56.28,
        "peak_bf16_tflops": 56.28,
        "peak_memory_bw_gbps": 960,
        "sm_count": 84,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 64,
        "memory_gb": 16,
        "memory_type": "GDDR7",
        "form_factor": "PCIe",
        "tdp_w": 360,
    },
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. GPU specs table: “A100/H100” are multi-SKU; peak BW/TFLOPs + memory size can be wrong under fuzzy match/fallback (e.g., A100 80GB → 40GB). Can we avoid silent A100 fallback and add a match_type/matched_name field (or split common SKUs) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Examples/docs are now inconsistent with the API: get_gpu_specs() now requires gpu_name: str, but:
PR description example still shows get_gpu_specs() with no args
gpu_specs.py docstring example uses "NVIDIA A100" (no longer a key)
main uses example_gpu = "NVIDIA A100" (also not a key)