diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
new file mode 100644
index 0000000..fe596a2
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Diagnose Prompt Module for Hardware Bottleneck Analysis.
+
+"""
+
+from .gpu_specs import get_gpu_specs
+
+
+__all__ = ["get_gpu_specs"]
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py
new file mode 100644
index 0000000..3e15846
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py
@@ -0,0 +1,95 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU Specifications Lookup for Bottleneck Analysis
+
+This module provides GPU hardware specifications needed for performance analysis
+and bottleneck identification. It includes peak compute performance, memory bandwidth,
+cache sizes, and SM counts for common NVIDIA GPUs.
+
+"""
+
+import logging
+from typing import Any
+
+from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
+    GPU_SPECS_DATABASE,
+)
+
+__all__ = ["GPU_SPECS_DATABASE", "get_gpu_specs"]
+
+logger = logging.getLogger(__name__)
+
+
+def get_gpu_specs(gpu_name: str) -> dict[str, Any] | None:
+    """
+    Get GPU specifications for bottleneck analysis.
+
+    This function returns hardware specifications needed for performance analysis,
+    including peak compute performance, memory bandwidth, cache sizes, and SM counts.
+
+    Args:
+        gpu_name: GPU name. Must exactly match a key in GPU_SPECS_DATABASE.
+
+    Returns:
+        Dictionary with GPU specifications, or None if the GPU is not in the database.
+        When successful, contains:
+        - name: GPU name
+        - architecture: GPU architecture (e.g., "Ampere", "Hopper")
+        - peak_fp32_tflops: Peak FP32 compute performance in TFLOPS
+        - peak_fp16_tflops: Peak FP16 compute performance in TFLOPS
+        - peak_bf16_tflops: Peak BF16 compute performance in TFLOPS (0 if not supported)
+        - peak_memory_bw_gbps: Peak memory bandwidth in GB/s
+        - sm_count: Number of streaming multiprocessors
+        - max_threads_per_sm: Maximum threads per SM
+        - l1_cache_kb: L1 cache size in KB per SM
+        - l2_cache_mb: Total L2 cache size in MB
+        - memory_gb: Total GPU memory in GB
+        - memory_type: Memory type (e.g., "HBM2e", "GDDR6X")
+
+    Examples:
+        >>> specs = get_gpu_specs("NVIDIA A100 SXM4 80GB")
+        >>> if specs:
+        ...     print(f"SM Count: {specs['sm_count']}")
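+
+        Unknown names return None rather than raising (illustrative; "Imaginary
+        GPU" is a hypothetical name, not a database entry):
+
+        >>> get_gpu_specs("Imaginary GPU") is None
+        True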
+    """
+    if gpu_name in GPU_SPECS_DATABASE:
+        return GPU_SPECS_DATABASE[gpu_name].copy()
+
+    logger.warning(
+        "Unknown GPU: '%s'. Optimization will be disabled. Available GPUs: %s",
+        gpu_name,
+        ", ".join(GPU_SPECS_DATABASE.keys()),
+    )
+    return None
+
+
+if __name__ == "__main__":
+    print("GPU Specifications Module")
+    print("=" * 60)
+
+    # Show all available GPUs
+    print("Available GPU specifications in database:")
+    for gpu_name in sorted(GPU_SPECS_DATABASE.keys()):
+        print(f"  - {gpu_name}")
+
+    # Example usage
+    print(f"\n{'=' * 60}")
+    example_gpu = "NVIDIA A100 SXM4 80GB"
+    specs = get_gpu_specs(example_gpu)
+    if specs:
+        print(f"\nExample specs for {example_gpu}:")
+        print(f"  - Peak Memory Bandwidth: {specs['peak_memory_bw_gbps']} GB/s")
+        print(f"  - Peak FP32 Performance: {specs['peak_fp32_tflops']} TFLOPS")
+        print(f"  - SM Count: {specs['sm_count']}")
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
new file mode 100644
index 0000000..cbc616d
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU Specifications Database
+
+This module contains the GPU hardware specifications database used for
+performance analysis and bottleneck identification. It includes specific
+SKU variants for multi-SKU GPUs such as the A100 and H100.
+
+Sources:
+- NVIDIA official specifications and datasheets
+- TechPowerUp GPU Database
+
+Last Updated: January 2026
+"""
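+
+# Example (illustrative, not used by this module): with the A100 SXM4 80GB
+# entry below, a dense FP32 kernel is roughly memory-bound once its
+# arithmetic intensity drops below 19.5e12 / 2039e9 ≈ 9.6 FLOP/byte.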
+
+GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
+    # NVIDIA A100 SKUs - SXM4 Variants
+    "NVIDIA A100 SXM4 40GB": {
+        "name": "NVIDIA A100 SXM4 40GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1555,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 40,
+        "memory_type": "HBM2e",
+        "form_factor": "SXM4",
+        "tdp_w": 400,
+    },
+    "NVIDIA A100 SXM4 80GB": {
+        "name": "NVIDIA A100 SXM4 80GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 2039,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "SXM4",
+        "tdp_w": 400,
+    },
+    # NVIDIA A100 SKUs - PCIe Variants
+    "NVIDIA A100 PCIe 40GB": {
+        "name": "NVIDIA A100 PCIe 40GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1555,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 40,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 250,
+    },
+    "NVIDIA A100 PCIe 80GB": {
+        "name": "NVIDIA A100 PCIe 80GB",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,  # Without sparsity
+        "peak_bf16_tflops": 312.0,  # Without sparsity
+        "peak_memory_bw_gbps": 1935,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 300,
+    },
+    # NVIDIA H100 SKUs - SXM5 Variant
+    "NVIDIA H100 SXM5 80GB": {
+        "name": "NVIDIA H100 SXM5 80GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 67.0,
+        "peak_fp16_tflops": 1979.0,  # With sparsity
+        "peak_bf16_tflops": 1979.0,  # With sparsity
+        "peak_memory_bw_gbps": 3350,
+        "sm_count": 132,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 80,
+        "memory_type": "HBM3",
+        "form_factor": "SXM5",
+        "tdp_w": 700,
+    },
+    # NVIDIA H100 SKUs - PCIe Variant
+    "NVIDIA H100 PCIe 80GB": {
+        "name": "NVIDIA H100 PCIe 80GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 51.0,
+        "peak_fp16_tflops": 1513.0,  # With sparsity
+        "peak_bf16_tflops": 1513.0,  # With sparsity
+        "peak_memory_bw_gbps": 2000,
+        "sm_count": 114,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 80,
+        "memory_type": "HBM2e",
+        "form_factor": "PCIe",
+        "tdp_w": 350,
+    },
+    # NVIDIA H100 SKUs - NVL Variant (for LLM inference)
+    "NVIDIA H100 NVL 94GB": {
+        "name": "NVIDIA H100 NVL 94GB",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 60.0,
+        "peak_fp16_tflops": 1671.0,  # With sparsity
+        "peak_bf16_tflops": 1671.0,  # With sparsity
+        "peak_memory_bw_gbps": 3900,
+        "sm_count": 132,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 94,
+        "memory_type": "HBM3",
+        "form_factor": "PCIe",
+        "tdp_w": 400,
+    },
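+    # Note: the datacenter entries above quote Tensor Core FP16/BF16 rates,
+    # while the consumer entries below quote shader (non-tensor) rates.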
+    # NVIDIA RTX 4090
+    "NVIDIA RTX 4090": {
+        "name": "NVIDIA RTX 4090",
+        "architecture": "Ada Lovelace",
+        "peak_fp32_tflops": 82.58,
+        "peak_fp16_tflops": 82.58,
+        "peak_bf16_tflops": 82.58,
+        "peak_memory_bw_gbps": 1008,
+        "sm_count": 128,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 72,
+        "memory_gb": 24,
+        "memory_type": "GDDR6X",
+        "form_factor": "PCIe",
+        "tdp_w": 450,
+    },
+    # NVIDIA RTX 5080
+    "NVIDIA RTX 5080": {
+        "name": "NVIDIA RTX 5080",
+        "architecture": "Blackwell",
+        "peak_fp32_tflops": 56.28,
+        "peak_fp16_tflops": 56.28,
+        "peak_bf16_tflops": 56.28,
+        "peak_memory_bw_gbps": 960,
+        "sm_count": 84,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 64,
+        "memory_gb": 16,
+        "memory_type": "GDDR7",
+        "form_factor": "PCIe",
+        "tdp_w": 360,
+    },
+}
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py
new file mode 100644
index 0000000..64d1d67
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/metric_schema.py
@@ -0,0 +1,151 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Metric Schema Definitions for NCU Profiling and GPU Specifications.
+
+This module provides the single source of truth for:
+- NCU profiling metric definitions (keys, labels, units)
+- GPU specification field definitions
+
+Schema Format: List of tuples (display_label, key, unit_suffix)
+- display_label: Human-readable name shown in prompts
+- key: NCU metric key or GPU spec dictionary key
+- unit_suffix: Unit to append after value (e.g., "%", " GB/s", " bytes")
+"""
+
+# Type alias for metric definition: (label, key, unit)
+MetricDef = tuple[str, str, str]
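+
+# Example (illustrative): the MetricDef ("SM Count", "sm_count", "") is
+# rendered as f"{label}: {value}{unit}", e.g. "SM Count: 108" on an A100.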
+
+# =============================================================================
+# GPU Specification Fields
+# =============================================================================
+
+GPU_SPEC_FIELDS: list[MetricDef] = [
+    ("Name", "name", ""),
+    ("Architecture", "architecture", ""),
+    ("Peak Memory Bandwidth", "peak_memory_bw_gbps", " GB/s"),
+    ("Peak FP32 Performance", "peak_fp32_tflops", " TFLOPS"),
+    ("Peak FP16 Performance", "peak_fp16_tflops", " TFLOPS"),
+    ("SM Count", "sm_count", ""),
+    ("Max Threads per SM", "max_threads_per_sm", ""),
+    ("L1 Cache per SM", "l1_cache_kb", " KB"),
+    ("L2 Cache (Total)", "l2_cache_mb", " MB"),
+]
+
+# Special case: Memory Size has two fields combined
+GPU_MEMORY_FIELDS: list[tuple[str, str, str, str]] = [
+    # (label, size_key, type_key, size_unit)
+    ("Memory Size", "memory_gb", "memory_type", " GB"),
+]
+
+# =============================================================================
+# NCU Profiling Metric Sections
+# =============================================================================
+
+NCU_METRIC_SECTIONS: dict[str, list[MetricDef]] = {
+    "SM & Compute Utilization": [
+        ("SM Cycles Active", "sm__cycles_active.avg", ""),
+        ("Warp Active", "sm__warps_active.avg.pct_of_peak_sustained_active", "%"),
+        ("Total Instructions Executed", "sm__inst_executed.sum", ""),
+        (
+            "Tensor Core Utilization",
+            "sm__inst_executed_pipe_tensor.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+        (
+            "Tensor Core Pipeline Active",
+            "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+    ],
+    "Memory Bandwidth & Cache": [
+        (
+            "DRAM Throughput",
+            "dram__throughput.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+        ("DRAM Bandwidth", "dram__bytes.sum.per_second", " bytes/sec"),
+        (
+            "GPU DRAM Throughput",
+            "gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed",
+            "%",
+        ),
+        ("DRAM Bytes Read", "dram__bytes_read.sum", " bytes"),
+        ("DRAM Bytes Write", "dram__bytes_write.sum", " bytes"),
+        ("L1 Cache Hit Rate", "l1tex__t_sector_hit_rate.pct", "%"),
+        (
+            "L1 Throughput",
+            "l1tex__throughput.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+        ("L2 Cache Hit Rate", "lts__t_sector_hit_rate.pct", "%"),
+        (
+            "L2 Throughput",
+            "lts__throughput.avg.pct_of_peak_sustained_active",
+            "%",
+        ),
+    ],
+    "Memory Access Patterns": [
+        (
+            "Memory Coalescing",
+            "smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct",
+            "%",
+        ),
+        (
+            "Branch Uniformity",
+            "smsp__sass_average_branch_targets_threads_uniform.pct",
+            "%",
+        ),
+    ],
+    "Occupancy & Resources": [
+        ("Occupancy Limited By Blocks", "launch__occupancy_limit_blocks", ""),
+        ("Occupancy Limited By Registers", "launch__occupancy_limit_registers", ""),
+        (
+            "Occupancy Limited By Shared Memory",
+            "launch__occupancy_limit_shared_mem",
+            "",
+        ),
+        ("Registers per Thread", "launch__registers_per_thread", ""),
+        (
+            "Shared Memory per Block",
+            "launch__shared_mem_per_block_allocated",
+            " bytes",
+        ),
+    ],
+    "Stall Metrics (Warp Issue Stalls)": [
+        (
+            "Short Scoreboard Stalls",
+            "smsp__warp_issue_stalled_short_scoreboard_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Long Scoreboard Stalls",
+            "smsp__warp_issue_stalled_long_scoreboard_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Barrier Stalls",
+            "smsp__warp_issue_stalled_barrier_per_warp_active.pct",
+            "%",
+        ),
+        (
+            "Branch Resolving Stalls",
+            "smsp__warp_issue_stalled_branch_resolving_per_warp_active.pct",
+            "%",
+        ),
+    ],
+}
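+
+# Illustrative consumption sketch (assumed usage, not defined in this module):
+#   for section, metric_defs in NCU_METRIC_SECTIONS.items():
+#       for label, key, unit in metric_defs:
+#           value = ncu_row.get(key)  # `ncu_row`: a hypothetical parsed NCU CSV row
+#           print(f"{label}: {value}{unit}")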
diff --git a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
index 4ce8568..4b1bf83 100644
--- a/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
+++ b/kernel_perf_agent/kernel_opt/profiler/ncu_profiler.py
@@ -255,12 +255,8 @@ def _apply_selection_policy(
     Returns:
         DataFrame with a single row based on the policy
     """
-    if df.empty:
+    if len(df) <= 1:
         return df
-
-    if len(df) == 1:
-        return df
-
     if policy == MetricSelectionPolicy.FIRST:
         return df.iloc[[0]]
     elif policy == MetricSelectionPolicy.LAST:
@@ -317,7 +313,7 @@ def load_ncu_metrics(
     extra_keep: Optional[Sequence[str]] = ("Kernel Name",),
     coerce_numeric: bool = True,
     name_list: Optional[Sequence[str]] = None,
-    select: Union[str, MetricSelectionPolicy] = MetricSelectionPolicy.LAST,
+    select: MetricSelectionPolicy = MetricSelectionPolicy.LAST,
 ) -> pd.DataFrame:
     """
     Load and parse NCU metrics from CSV file.
@@ -328,32 +324,19 @@
         extra_keep: Additional columns to keep (e.g., "Kernel Name")
         coerce_numeric: Convert metric values to numeric
         name_list: Filter by kernel names (substring match)
-        select: Selection policy when multiple rows per name.
-            Can be MetricSelectionPolicy enum or string ("first", "last", "max_cycles")
+        select: Selection policy when multiple rows per name
 
     Returns:
         DataFrame with parsed metrics
 
     Raises:
         FileNotFoundError: If CSV file not found
-        ValueError: If no requested columns found in CSV or invalid select value
+        ValueError: If no requested columns found in CSV
     """
     csv_path = Path(csv_path)
     if not csv_path.exists():
         raise FileNotFoundError(f"CSV not found: {csv_path}")
 
-    # Convert string to enum if needed
-    if isinstance(select, str):
-        try:
-            policy = MetricSelectionPolicy(select)
-        except ValueError:
-            raise ValueError(
-                f"Invalid select value: {select}. "
-                f"Must be one of: {[p.value for p in MetricSelectionPolicy]}"
-            )
-    else:
-        policy = select
-
     df = pd.read_csv(csv_path, comment="=", low_memory=False)
 
     metric_cols = list(columns) if columns is not None else METRIC_COLUMNS
@@ -383,14 +366,11 @@
             .apply(pd.to_numeric, errors="coerce")
         )
 
-    # Filter by kernel name list if provided
-    if name_list:
-        sub = _filter_by_kernel_names(sub, name_list, policy, keep_cols)
-    else:
-        # Apply selection to all rows if no name filter
-        sub = _apply_selection_policy(sub, policy)
-
-    return sub
+    return (
+        _filter_by_kernel_names(sub, name_list, select, keep_cols)
+        if name_list
+        else _apply_selection_policy(sub, select)
+    )
 
 
 def metrics_to_prompt(
diff --git a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
index 733216c..fe1e84f 100644
--- a/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
+++ b/triton_kernel_agent/opt_worker_component/benchmarking/kernel_subprocess.py
@@ -288,12 +288,11 @@ def main():
     args = _parse_args()
 
     device = torch.device(args.device)
-    dtype_map = {
+    dtype = {
         "float32": torch.float32,
         "float16": torch.float16,
         "bfloat16": torch.bfloat16,
-    }
-    dtype = dtype_map[args.dtype]
+    }[args.dtype]
 
     if not args.quiet:
         print("=" * 80)