From 40eff8b6a48f9dff5e95b800c0007b8a7c9c47aa Mon Sep 17 00:00:00 2001
From: Kaiming Cheng
Date: Fri, 30 Jan 2026 20:41:52 -0800
Subject: [PATCH 1/2] introduce judger prompt

---
 .../kernel_opt/diagnose_prompt/__init__.py    |  20 ++
 .../diagnose_prompt/judger_prompt.py          | 302 ++++++++++++++++++
 2 files changed, 322 insertions(+)
 create mode 100644 kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
 create mode 100644 kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py

diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
new file mode 100644
index 0000000..d8db477
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Diagnose Prompt Module for Hardware Bottleneck Analysis.
+
+"""
+
+__all__: list[str] = []
diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py
new file mode 100644
index 0000000..67fbb13
--- /dev/null
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py
@@ -0,0 +1,302 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Bottleneck Analysis Prompt Builder
+
+Provides prompt templates and parsing utilities for LLM-based bottleneck analysis
+of NCU profiling metrics.
+
+Bottleneck Categories:
+- memory: Memory bandwidth is the limiting factor
+- compute: Compute throughput is the limiting factor
+- underutilized: Neither saturated (<60% both), indicating stalls/occupancy issues
+
+Metric definitions are in metric_schema.py.
+""" + +import json +import re +from dataclasses import dataclass, field +from typing import Any + +from kernel_perf_agent.kernel_opt.diagnose_prompt.metric_schema import ( + GPU_MEMORY_FIELDS, + GPU_SPEC_FIELDS, + NCU_METRIC_SECTIONS, +) +from kernel_perf_agent.kernel_opt.roofline.ncu_roofline import RooflineResult + +BOTTLENECK_CATEGORIES = {"memory", "compute", "underutilized"} + + +# ============================================================================= +# Data Classes +# ============================================================================= + + +@dataclass +class BottleneckResult: + """A single bottleneck analysis.""" + + category: str + summary: str + reasoning: str + root_causes: list[dict[str, Any]] = field(default_factory=list) + recommended_fixes: list[dict[str, Any]] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "category": self.category, + "summary": self.summary, + "reasoning": self.reasoning, + "root_causes": self.root_causes, + "recommended_fixes": self.recommended_fixes, + } + + +# ============================================================================= +# Prompt Template +# ============================================================================= + + +BOTTLENECK_PROMPT = """\ +You are a GPU performance expert analyzing Triton kernel profiling data. + +## Task +Analyze the NCU metrics and identify {num_bottlenecks} performance bottleneck(s). For each, classify as: +- **memory**: Memory bandwidth is the limiting factor +- **compute**: Compute throughput is the limiting factor +- **underutilized**: Neither saturated (<60% both), indicating stalls/occupancy issues + +## GPU Specifications +{gpu_specs} + +## Roofline Analysis +- Bottleneck: {roofline_bottleneck} +- Compute SOL: {compute_sol:.1f}% +- Memory SOL: {memory_sol:.1f}% +- Efficiency: {efficiency:.1f}% +- Headroom: {headroom:.1f}% +- At Roofline: {at_roofline} +- Tensor Cores: {uses_tc} +- Warnings: {roofline_warnings} + +## NCU Metrics +{ncu_metrics} + +## Kernel Code +```python +{kernel_code} +``` + +## Output (JSON array, no markdown fence) +[ + {{ + "category": "memory" | "compute" | "underutilized", + "summary": "One-line summary", + "reasoning": "Explanation citing metrics", + "root_causes": [ + {{ + "cause": "Description", + "evidence": [{{"metric": "name", "value": 0.0, "interpretation": "meaning"}}] + }} + ], + "recommended_fixes": [ + {{"fix": "Actionable instruction", "rationale": "Why"}} + ] + }} +] + +Requirements: +- Provide exactly {num_bottlenecks} bottleneck analysis object(s) in the array. +- Order by importance (most critical first). +- Each bottleneck should have exactly {num_causes} root cause(s) and {num_fixes} corresponding fix(es). +- Keep summaries and reasoning concise and grounded in the provided metrics. 
+""" + + +# ============================================================================= +# Prompt Building +# ============================================================================= + + +def _fmt_value(v: Any) -> str: + """Format a value for display in prompts.""" + if isinstance(v, float): + return f"{v:.3g}" + if isinstance(v, int): + return str(v) + return str(v) + + +def _format_gpu_specs(gpu_specs: dict[str, Any]) -> str: + """Format GPU specifications using metric_schema definitions.""" + lines = [] + + for label, key, unit in GPU_SPEC_FIELDS: + value = gpu_specs.get(key) + if value is not None: + lines.append(f"- {label}: {_fmt_value(value)}{unit}") + + for label, size_key, type_key, unit in GPU_MEMORY_FIELDS: + size = gpu_specs.get(size_key) + mem_type = gpu_specs.get(type_key, "") + if size is not None: + type_str = f" {mem_type}" if mem_type else "" + lines.append(f"- {label}: {_fmt_value(size)}{unit}{type_str}") + + return "\n".join(lines) if lines else "N/A" + + +def _format_ncu_metrics(ncu_metrics: dict[str, Any]) -> str: + """Format NCU metrics grouped by section using metric_schema definitions.""" + lines = [] + + for section_name, metric_defs in NCU_METRIC_SECTIONS.items(): + section_lines = [] + for label, key, unit in metric_defs: + value = ncu_metrics.get(key) + if value is not None: + section_lines.append(f" - {label}: {_fmt_value(value)}{unit}") + + if section_lines: + lines.append(f"### {section_name}") + lines.extend(section_lines) + + schema_keys = {key for _, key, _ in sum(NCU_METRIC_SECTIONS.values(), [])} + other_keys = sorted(set(ncu_metrics.keys()) - schema_keys) + if other_keys: + lines.append("### Other Metrics") + for key in other_keys: + value = ncu_metrics[key] + lines.append(f" - {key}: {_fmt_value(value)}") + + return "\n".join(lines) if lines else "N/A" + + +def build_bottleneck_prompt( + kernel_code: str, + ncu_metrics: dict[str, Any], + roofline: RooflineResult, + gpu_specs: dict[str, Any], + num_bottlenecks: int = 2, + num_causes: int = 2, + num_fixes: int = 1, +) -> str: + """Build the bottleneck analysis prompt for the LLM. + + Args: + kernel_code: The Triton kernel source code. + ncu_metrics: NCU profiling metrics dictionary. + roofline: Roofline analysis result. + gpu_specs: GPU hardware specifications. + num_bottlenecks: Number of bottlenecks to request. + num_causes: Number of root causes per bottleneck. + num_fixes: Number of recommended fixes per bottleneck. + + Returns: + Formatted prompt string for the LLM. + """ + return BOTTLENECK_PROMPT.format( + num_bottlenecks=num_bottlenecks, + num_causes=num_causes, + num_fixes=num_fixes, + gpu_specs=_format_gpu_specs(gpu_specs), + roofline_bottleneck=roofline.bottleneck, + compute_sol=roofline.compute_sol_pct, + memory_sol=roofline.memory_sol_pct, + efficiency=roofline.efficiency_pct, + headroom=roofline.headroom_pct, + at_roofline="Yes" if roofline.at_roofline else "No", + uses_tc="Yes" if roofline.uses_tensor_cores else "No", + roofline_warnings="; ".join(roofline.warnings) or "None", + ncu_metrics=_format_ncu_metrics(ncu_metrics), + kernel_code=kernel_code, + ) + + +# ============================================================================= +# Response Parsing +# ============================================================================= + + +def parse_bottleneck_response( + response: str, + fallback_category: str = "underutilized", +) -> list[BottleneckResult]: + """Parse LLM response into a list of BottleneckResult. + + Args: + response: Raw LLM response text. 
+        fallback_category: Category to use if parsing fails.
+
+    Returns:
+        List of BottleneckResult. Empty list if parsing fails completely.
+    """
+    # Try to find JSON array
+    array_match = re.search(r"\[[\s\S]*\]", response)
+    if array_match:
+        try:
+            data = json.loads(array_match.group())
+            if isinstance(data, list):
+                return _parse_bottleneck_list(data, fallback_category)
+        except json.JSONDecodeError:
+            pass
+
+    # Fall back to single object
+    obj_match = re.search(r"\{[\s\S]*\}", response)
+    if obj_match:
+        try:
+            data = json.loads(obj_match.group())
+            if isinstance(data, dict):
+                return _parse_bottleneck_list([data], fallback_category)
+        except json.JSONDecodeError:
+            pass
+
+    return []
+
+
+def _parse_bottleneck_list(
+    items: list[dict[str, Any]],
+    fallback_category: str,
+) -> list[BottleneckResult]:
+    """Parse a list of bottleneck dicts into BottleneckResult objects."""
+    results = []
+    for item in items:
+        category = item.get("category", fallback_category)
+        if category not in BOTTLENECK_CATEGORIES:
+            category = fallback_category
+
+        root_causes = [
+            {"cause": rc.get("cause", "Unknown"), "evidence": rc.get("evidence", [])}
+            for rc in item.get("root_causes", [])
+        ]
+
+        fixes = [
+            {"fix": f.get("fix", ""), "rationale": f.get("rationale", "")}
+            for f in item.get("recommended_fixes", [])
+        ]
+
+        results.append(
+            BottleneckResult(
+                category=category,
+                summary=item.get("summary", f"{category}-bound"),
+                reasoning=item.get("reasoning", ""),
+                root_causes=root_causes,
+                recommended_fixes=fixes,
+            )
+        )
+
+    return results

From ff72d5179dfb10f6f97f93f369d6f3ea5b4fe1d6 Mon Sep 17 00:00:00 2001
From: Kaiming Cheng
Date: Sun, 1 Feb 2026 23:25:36 -0800
Subject: [PATCH 2/2] map fix to corresponding cause instead of bottleneck

---
 .../diagnose_prompt/judger_prompt.py          | 45 ++++++++++++-------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py
index 67fbb13..55853b5 100644
--- a/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py
+++ b/kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompt.py
@@ -110,11 +110,11 @@ def to_dict(self) -> dict[str, Any]:
     "root_causes": [
       {{
         "cause": "Description",
-        "evidence": [{{"metric": "name", "value": 0.0, "interpretation": "meaning"}}]
+        "evidence": [{{"metric": "name", "value": 0.0, "interpretation": "meaning"}}],
+        "fixes": [
+          {{"fix": "Actionable instruction", "rationale": "Why"}}
+        ]
       }}
-    ],
-    "recommended_fixes": [
-      {{"fix": "Actionable instruction", "rationale": "Why"}}
     ]
   }}
 ]
@@ -122,7 +122,7 @@
 Requirements:
 - Provide exactly {num_bottlenecks} bottleneck analysis object(s) in the array.
 - Order by importance (most critical first).
-- Each bottleneck should have exactly {num_causes} root cause(s) and {num_fixes} corresponding fix(es).
+- Each bottleneck should have exactly {num_causes} root cause(s), each with {num_fixes} fix(es).
 - Keep summaries and reasoning concise and grounded in the provided metrics.
 """
 
@@ -191,7 +191,7 @@ def build_bottleneck_prompt(
     ncu_metrics: dict[str, Any],
     roofline: RooflineResult,
     gpu_specs: dict[str, Any],
-    num_bottlenecks: int = 2,
+    num_bottlenecks: int = 1,
     num_causes: int = 2,
     num_fixes: int = 1,
 ) -> str:
@@ -204,7 +204,7 @@ def build_bottleneck_prompt(
         gpu_specs: GPU hardware specifications.
         num_bottlenecks: Number of bottlenecks to request.
         num_causes: Number of root causes per bottleneck.
-        num_fixes: Number of recommended fixes per bottleneck.
+        num_fixes: Number of recommended fixes per root cause.
 
     Returns:
         Formatted prompt string for the LLM.
@@ -279,15 +279,28 @@ def _parse_bottleneck_list(
         if category not in BOTTLENECK_CATEGORIES:
             category = fallback_category
 
-        root_causes = [
-            {"cause": rc.get("cause", "Unknown"), "evidence": rc.get("evidence", [])}
-            for rc in item.get("root_causes", [])
-        ]
+        # Parse root causes with nested fixes
+        root_causes = []
+        all_fixes = []
+        for rc in item.get("root_causes", []):
+            cause_fixes = [
+                {"fix": f.get("fix", ""), "rationale": f.get("rationale", "")}
+                for f in rc.get("fixes", [])
+            ]
+            root_causes.append(
+                {
+                    "cause": rc.get("cause", "Unknown"),
+                    "evidence": rc.get("evidence", []),
+                    "fixes": cause_fixes,
+                }
+            )
+            all_fixes.extend(cause_fixes)
 
-        fixes = [
-            {"fix": f.get("fix", ""), "rationale": f.get("rationale", "")}
-            for f in item.get("recommended_fixes", [])
-        ]
+        # Also check for legacy top-level recommended_fixes
+        for f in item.get("recommended_fixes", []):
+            fix_entry = {"fix": f.get("fix", ""), "rationale": f.get("rationale", "")}
+            if fix_entry not in all_fixes:
+                all_fixes.append(fix_entry)
 
         results.append(
             BottleneckResult(
@@ -295,7 +308,7 @@ def _parse_bottleneck_list(
                 summary=item.get("summary", f"{category}-bound"),
                 reasoning=item.get("reasoning", ""),
                 root_causes=root_causes,
-                recommended_fixes=fixes,
+                recommended_fixes=all_fixes,
             )
         )
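
Usage sketch (illustrative, not part of either patch): building a prompt without a real NCU run. RooflineResult's constructor and the key names defined in metric_schema.py are not shown in this series, so the sketch uses a duck-typed stand-in object and made-up metric keys; build_bottleneck_prompt only reads the roofline attributes listed below.

from types import SimpleNamespace

from kernel_perf_agent.kernel_opt.diagnose_prompt.judger_prompt import (
    build_bottleneck_prompt,
)

# Stand-in for RooflineResult: only the attributes the builder reads are set.
roofline = SimpleNamespace(
    bottleneck="memory",
    compute_sol_pct=31.0,
    memory_sol_pct=88.5,
    efficiency_pct=74.2,
    headroom_pct=25.8,
    at_roofline=False,
    uses_tensor_cores=False,
    warnings=["short kernel duration; SOL numbers may be noisy"],
)

prompt = build_bottleneck_prompt(
    kernel_code="@triton.jit\ndef add_kernel(...): ...",  # placeholder kernel source
    ncu_metrics={"dram__throughput.pct_of_peak": 88.5},   # hypothetical key; unmatched keys land under "Other Metrics"
    roofline=roofline,
    gpu_specs={},  # no schema keys matched, so the section renders as "N/A"
)
print(prompt.splitlines()[0])  # "You are a GPU performance expert analyzing Triton kernel profiling data."

With the PATCH 2/2 defaults this asks for a single bottleneck with two root causes and one fix per cause.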
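
Usage sketch (illustrative): parsing a reply in the nested root_causes/fixes format that PATCH 2/2 requests. The reply text is invented; the behavior shown follows parse_bottleneck_response and _parse_bottleneck_list as patched, where per-cause fixes stay attached to each cause and are also flattened into recommended_fixes.

from kernel_perf_agent.kernel_opt.diagnose_prompt.judger_prompt import (
    parse_bottleneck_response,
)

sample_reply = """
Here is my analysis:
[
  {
    "category": "memory",
    "summary": "DRAM bandwidth saturated by uncoalesced loads",
    "reasoning": "Memory SOL far exceeds compute SOL in the provided metrics.",
    "root_causes": [
      {
        "cause": "Strided global loads defeat coalescing",
        "evidence": [{"metric": "memory_sol_pct", "value": 88.5, "interpretation": "near the DRAM roofline"}],
        "fixes": [
          {"fix": "Remap the block so threads read contiguous elements", "rationale": "Coalesced accesses cut wasted bytes per transaction"}
        ]
      }
    ]
  }
]
"""

result = parse_bottleneck_response(sample_reply)[0]
print(result.category)                 # "memory"
print(result.root_causes[0]["fixes"])  # fixes stay attached to their root cause
print(result.recommended_fixes)        # and are also flattened here for legacy callers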
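
A second parsing sketch covering the fallback paths visible in the patch: when the reply contains no JSON array the parser retries with a single JSON object, an unrecognized category is replaced with fallback_category (default "underutilized"), and a reply with no JSON at all yields an empty list.

from kernel_perf_agent.kernel_opt.diagnose_prompt.judger_prompt import (
    parse_bottleneck_response,
)

# Single-object reply with an unknown category.
reply = '{"category": "latency", "summary": "stalls dominate"}'
results = parse_bottleneck_response(reply)
print(results[0].category)           # "underutilized" -- "latency" is not a known category
print(results[0].summary)            # "stalls dominate"
print(results[0].recommended_fixes)  # [] -- the reply listed no causes or fixes

# No JSON at all.
print(parse_bottleneck_response("The kernel looks fine to me."))  # []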