From fefa0bc5c19a12f5eabc07af0411e47a624be362 Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Feb 2026 03:21:51 +0800 Subject: [PATCH 1/3] #87-add llvm github issue preprocessor --- .../preprocessor/git_issue_preprocessor.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 pinecone_rag/preprocessor/git_issue_preprocessor.py diff --git a/pinecone_rag/preprocessor/git_issue_preprocessor.py b/pinecone_rag/preprocessor/git_issue_preprocessor.py new file mode 100644 index 0000000..c788a89 --- /dev/null +++ b/pinecone_rag/preprocessor/git_issue_preprocessor.py @@ -0,0 +1,194 @@ +""" +GitHub issue/PR preprocessor for Pinecone RAG. + +Loads GitHub issue or PR JSON files from data/github (e.g. data/github/Clang/issue/84062.json), +extracts issue_info or pr_info and comments, and produces LangChain Documents with metadata: +doc_id, title, url, author, timestamp, type (github-issue | github-pr), repository, number, state, labels. +""" + +import json +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional + +from langchain_core.documents import Document + +from config import GitConfig +from preprocessor.utility import get_timestamp_from_date, validate_content_length + +logger = logging.getLogger(__name__) + + +def _get_nested(data: Dict[str, Any], *keys: str, default: Any = None) -> Any: + """Get nested key from dict; return default if any key is missing.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data + + +def _extract_labels(info: Dict[str, Any]) -> List[str]: + """Extract label names from issue/PR info.""" + labels_raw = info.get("labels") or [] + return [ + lb.get("name") if isinstance(lb, dict) else str(lb) for lb in labels_raw if lb + ] + + +def _build_content_parts( + labels: List[str], body: str, comments: List[Any] +) -> List[str]: + """Build list of content segments: title, body, then each comment with header.""" + parts = [f"Labels: {', '.join(labels)}\n\n", body] if labels else [body] + for com in comments: + com_body = (com.get("body") or "").strip() + if not com_body or com_body == "": + continue + com_user = _get_nested(com, "user", "login", default="") or "" + com_created = com.get("created_at") or "" + parts.append(f"\n\n--- Comment by {com_user} ({com_created}) ---\n\n{com_body}") + return parts + + +def _info_to_document( + info: Dict[str, Any], + comments: List[Any], + json_path: Path, + min_content_length: int, +) -> Optional[Document]: + """ + Build one Document from GitHub issue or PR info dict and comments list. + + Shared by issue and PR; doc_type is "github-issue" or "github-pr". + """ + html_url = info.get("html_url") or info.get("url") or "" + if not html_url: + return None + + title = (info.get("title") or "").strip() + body = (info.get("body") or "").strip() + author = _get_nested(info, "user", "login", default="") or "" + created_at = get_timestamp_from_date(info.get("created_at")) + updated_at = get_timestamp_from_date(info.get("updated_at")) + closed_at = get_timestamp_from_date( + info.get("closed_at", info.get("pushed_at", "")) + ) + labels = _extract_labels(info) + state = info.get("state", "") or "" + state_reason = info.get("state_reason", "") or "" + + number = info.get("number", -1) + + content = "".join(_build_content_parts(labels, body, comments)).strip() + if not validate_content_length(content, min_length=min_content_length): + logger.debug("Skip %s: content too short", json_path.name) + return None + + meta: Dict[str, Any] = { + "author": author or "", + "title": title or "", + "number": number or -1, + "url": html_url or "", + "created_at": created_at or 0.0, + "updated_at": updated_at or 0.0, + "closed_at": closed_at or 0.0, + "type": "issue", + "state": state or "", + "state_reason": state_reason or "", + } + + return Document(page_content=content, metadata=meta) + + +def _issue_json_to_document( + json_path: Path, + data: Dict[str, Any], + min_content_length: int, +) -> Optional[Document]: + """Build one Document from a GitHub issue JSON file (issue_info + comments).""" + info = data.get("issue_info") + if not info or not isinstance(info, dict): + logger.debug("Skip %s: no issue_info", json_path.name) + return None + comments = data.get("comments") or [] + return _info_to_document(info, comments, json_path, min_content_length) + + +def _pr_json_to_document( + json_path: Path, + data: Dict[str, Any], + min_content_length: int, +) -> Optional[Document]: + """Build one Document from a GitHub PR JSON file (pr_info + comments).""" + info = data.get("pr_info") + if not info or not isinstance(info, dict): + logger.debug("Skip %s: no pr_info", json_path.name) + return None + comments = data.get("comments") or [] + if isinstance(comments, dict): + comments = [] + return _info_to_document(info, comments, json_path, min_content_length, "github-pr") + + +def _load_one_json( + json_path: Path, + min_content_length: int, +) -> Optional[Document]: + """Load one JSON file and return an issue or PR document, or None.""" + try: + path_str = str(json_path) + path_str = path_str.lower() + raw = json_path.read_text(encoding="utf-8", errors="replace") + data = json.loads(raw) + except (json.JSONDecodeError, OSError) as e: + logger.debug("Skip %s: %s", json_path.name, e) + return None + + if "issue_info" in data: + return _issue_json_to_document(json_path, data, min_content_length) + if "pr_info" in data: + return _pr_json_to_document(json_path, data, min_content_length) + logger.debug("Skip %s: no issue_info or pr_info", json_path.name) + return None + + +class GitIssuePreprocessor: + """ + Process GitHub issue/PR JSON files under data/github for RAG. + + Discovers all *.json under data_dir (e.g. data/github/Clang/issue/*.json), + parses issue_info or pr_info + comments, and produces one Document per file. + """ + + def __init__(self, config: Optional[GitConfig] = None): + self.config = config or GitConfig() + self.data_dir = Path(self.config.data_dir) / "issue" + self.min_content_length = self.config.min_content_length + + def load_documents(self, limit: Optional[int] = None) -> List[Document]: + """ + Load all GitHub issue/PR JSON files and convert to Documents. + + Returns one Document per JSON file (issue or PR with comments). + """ + if not self.data_dir.exists(): + logger.warning("Git data dir does not exist: %s", self.data_dir) + return [] + + json_paths = sorted(self.data_dir.rglob("*.json")) + if limit is not None: + json_paths = json_paths[:limit] + + documents: List[Document] = [] + for json_path in json_paths: + doc = _load_one_json(json_path, self.min_content_length) + if doc is not None: + documents.append(doc) + + logger.info( + "Loaded %d GitHub issue/PR documents from %s", + len(documents), + self.data_dir, + ) + return documents From 204397e0a7dd390cbf099351435433995dc4655d Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Feb 2026 04:04:13 +0800 Subject: [PATCH 2/3] #87-fix issue preprocessor for first review --- .../preprocessor/git_issue_preprocessor.py | 155 ++++++++++++------ 1 file changed, 105 insertions(+), 50 deletions(-) diff --git a/pinecone_rag/preprocessor/git_issue_preprocessor.py b/pinecone_rag/preprocessor/git_issue_preprocessor.py index c788a89..66147fa 100644 --- a/pinecone_rag/preprocessor/git_issue_preprocessor.py +++ b/pinecone_rag/preprocessor/git_issue_preprocessor.py @@ -1,9 +1,14 @@ """ -GitHub issue/PR preprocessor for Pinecone RAG. +GitHub issue preprocessor for Pinecone RAG (LLVM only). -Loads GitHub issue or PR JSON files from data/github (e.g. data/github/Clang/issue/84062.json), -extracts issue_info or pr_info and comments, and produces LangChain Documents with metadata: -doc_id, title, url, author, timestamp, type (github-issue | github-pr), repository, number, state, labels. +Loads GitHub issue JSON files from ``data/github/**/issue/*.json`` (e.g. +``data/github/Clang/issue/84062.json``), extracts ``issue_info`` and comments, +and produces LangChain Documents for chunking and embedding. Only issues from the +LLVM GitHub organization (e.g. llvm/llvm-project) are included; others are skipped. + +Each Document has metadata: title, url, author, number, state, state_reason, +created_at, updated_at, closed_at, and type ``"issue"``. Content is built from +labels, body, and comment threads. """ import json @@ -20,16 +25,32 @@ def _get_nested(data: Dict[str, Any], *keys: str, default: Any = None) -> Any: - """Get nested key from dict; return default if any key is missing.""" + """Get a nested key from a dict by path; return default if any key is missing. + + Args: + data: Root dictionary to traverse. + *keys: Sequence of keys to follow (e.g. "user", "login"). + default: Value to return if the path is missing or a step is not a dict. + + Returns: + The value at the key path, or default. + """ + if not isinstance(data, dict): + return default for key in keys: - if not isinstance(data, dict): - return default data = data.get(key, default) return data def _extract_labels(info: Dict[str, Any]) -> List[str]: - """Extract label names from issue/PR info.""" + """Extract label names from issue info. + + Args: + info: GitHub issue object containing a "labels" array. + + Returns: + List of label name strings (from each label's "name" field or str(label)). + """ labels_raw = info.get("labels") or [] return [ lb.get("name") if isinstance(lb, dict) else str(lb) for lb in labels_raw if lb @@ -39,11 +60,21 @@ def _extract_labels(info: Dict[str, Any]) -> List[str]: def _build_content_parts( labels: List[str], body: str, comments: List[Any] ) -> List[str]: - """Build list of content segments: title, body, then each comment with header.""" + """Build list of content segments for the Document page_content. + + Args: + labels: Label names to prefix (optional "Labels: ..." line). + body: Issue body text. + comments: List of comment dicts with body, user.login, created_at. + + Returns: + List of strings to be joined: labels line (if any), body, then each + comment with a "--- Comment by {user} ({date}) ---" header. + """ parts = [f"Labels: {', '.join(labels)}\n\n", body] if labels else [body] for com in comments: com_body = (com.get("body") or "").strip() - if not com_body or com_body == "": + if not com_body: continue com_user = _get_nested(com, "user", "login", default="") or "" com_created = com.get("created_at") or "" @@ -57,12 +88,19 @@ def _info_to_document( json_path: Path, min_content_length: int, ) -> Optional[Document]: - """ - Build one Document from GitHub issue or PR info dict and comments list. + """Build one Document from GitHub issue info and comments. - Shared by issue and PR; doc_type is "github-issue" or "github-pr". + Args: + info: GitHub issue object (title, body, user, state, labels, etc.). + comments: List of comment dicts (body, user, created_at). + json_path: Path to the source JSON file (used for logging). + min_content_length: Minimum character length for content; shorter skips. + + Returns: + A Document with combined content (labels, body, comments) and metadata, + or None if html_url is missing or content is too short. """ - html_url = info.get("html_url") or info.get("url") or "" + html_url = info.get("html_url", "").strip() if not html_url: return None @@ -71,9 +109,11 @@ def _info_to_document( author = _get_nested(info, "user", "login", default="") or "" created_at = get_timestamp_from_date(info.get("created_at")) updated_at = get_timestamp_from_date(info.get("updated_at")) - closed_at = get_timestamp_from_date( - info.get("closed_at", info.get("pushed_at", "")) - ) + closed_at = info.get("closed_at") + if closed_at: + closed_at = get_timestamp_from_date(closed_at) + else: + closed_at = 0.0 labels = _extract_labels(info) state = info.get("state", "") or "" state_reason = info.get("state_reason", "") or "" @@ -106,7 +146,16 @@ def _issue_json_to_document( data: Dict[str, Any], min_content_length: int, ) -> Optional[Document]: - """Build one Document from a GitHub issue JSON file (issue_info + comments).""" + """Build one Document from a GitHub issue JSON file (issue_info + comments). + + Args: + json_path: Path to the JSON file (for logging). + data: Parsed JSON root; must contain "issue_info" and optionally "comments". + min_content_length: Minimum content length; shorter documents are skipped. + + Returns: + A Document with type "issue", or None if issue_info is missing or invalid. + """ info = data.get("issue_info") if not info or not isinstance(info, dict): logger.debug("Skip %s: no issue_info", json_path.name) @@ -115,62 +164,68 @@ def _issue_json_to_document( return _info_to_document(info, comments, json_path, min_content_length) -def _pr_json_to_document( +def _load_one_json( json_path: Path, - data: Dict[str, Any], min_content_length: int, ) -> Optional[Document]: - """Build one Document from a GitHub PR JSON file (pr_info + comments).""" - info = data.get("pr_info") - if not info or not isinstance(info, dict): - logger.debug("Skip %s: no pr_info", json_path.name) - return None - comments = data.get("comments") or [] - if isinstance(comments, dict): - comments = [] - return _info_to_document(info, comments, json_path, min_content_length, "github-pr") + """Load one JSON file and return an issue Document, or None. + Args: + json_path: Path to the JSON file. + min_content_length: Minimum content length; shorter documents are skipped. -def _load_one_json( - json_path: Path, - min_content_length: int, -) -> Optional[Document]: - """Load one JSON file and return an issue or PR document, or None.""" + Returns: + A Document for the issue, or None if the file is invalid, missing + issue_info, or content is too short. + """ try: - path_str = str(json_path) - path_str = path_str.lower() raw = json_path.read_text(encoding="utf-8", errors="replace") data = json.loads(raw) except (json.JSONDecodeError, OSError) as e: logger.debug("Skip %s: %s", json_path.name, e) return None - if "issue_info" in data: - return _issue_json_to_document(json_path, data, min_content_length) - if "pr_info" in data: - return _pr_json_to_document(json_path, data, min_content_length) - logger.debug("Skip %s: no issue_info or pr_info", json_path.name) - return None + if not isinstance(data, dict): + logger.debug("Skip %s: JSON root is not an object", json_path.name) + return None + if "issue_info" not in data: + logger.debug("Skip %s: no issue_info", json_path.name) + return None + return _issue_json_to_document(json_path, data, min_content_length) class GitIssuePreprocessor: - """ - Process GitHub issue/PR JSON files under data/github for RAG. + """Load LLVM GitHub issue JSON files and produce Documents. - Discovers all *.json under data_dir (e.g. data/github/Clang/issue/*.json), - parses issue_info or pr_info + comments, and produces one Document per file. + Discovers all ``*.json`` under ``config.data_dir / "issue"`` (e.g. + ``data/github/Clang/issue/*.json``), parses each as an issue with + issue_info and comments. Only issues from the LLVM GitHub organization + (e.g. llvm/llvm-project) are included; others are skipped. Returns a list + of LangChain Documents for RAG ingestion. """ def __init__(self, config: Optional[GitConfig] = None): + """Initialize the preprocessor with optional GitConfig. + + Args: + config: Git configuration (data_dir, min_content_length). If None, + uses default GitConfig(). + """ self.config = config or GitConfig() self.data_dir = Path(self.config.data_dir) / "issue" self.min_content_length = self.config.min_content_length def load_documents(self, limit: Optional[int] = None) -> List[Document]: - """ - Load all GitHub issue/PR JSON files and convert to Documents. + """Load issue JSON files from the configured issue directory and convert to Documents. + + Args: + limit: Optional maximum number of JSON files to process (by sorted path). + If None, all discovered *.json files are processed. - Returns one Document per JSON file (issue or PR with comments). + Returns: + List of Documents (one per valid issue JSON file) with combined + content (labels, body, comments) and metadata. Skips invalid files + and those with content below min_content_length. """ if not self.data_dir.exists(): logger.warning("Git data dir does not exist: %s", self.data_dir) @@ -187,7 +242,7 @@ def load_documents(self, limit: Optional[int] = None) -> List[Document]: documents.append(doc) logger.info( - "Loaded %d GitHub issue/PR documents from %s", + "Loaded %d GitHub issue documents from %s", len(documents), self.data_dir, ) From c1d4b900b3158998cd0c2c760e43712f3e747f3b Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Feb 2026 04:21:04 +0800 Subject: [PATCH 3/3] #87-fix some error for review bot --- .../preprocessor/git_issue_preprocessor.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pinecone_rag/preprocessor/git_issue_preprocessor.py b/pinecone_rag/preprocessor/git_issue_preprocessor.py index 66147fa..39efd52 100644 --- a/pinecone_rag/preprocessor/git_issue_preprocessor.py +++ b/pinecone_rag/preprocessor/git_issue_preprocessor.py @@ -35,10 +35,12 @@ def _get_nested(data: Dict[str, Any], *keys: str, default: Any = None) -> Any: Returns: The value at the key path, or default. """ - if not isinstance(data, dict): - return default for key in keys: - data = data.get(key, default) + if not isinstance(data, dict): + return default + if key not in data: + return default + data = data[key] return data @@ -52,9 +54,17 @@ def _extract_labels(info: Dict[str, Any]) -> List[str]: List of label name strings (from each label's "name" field or str(label)). """ labels_raw = info.get("labels") or [] - return [ - lb.get("name") if isinstance(lb, dict) else str(lb) for lb in labels_raw if lb - ] + labels: List[str] = [] + for lb in labels_raw: + if not lb: + continue + if isinstance(lb, dict): + name = (lb.get("name") or "").strip() + if name: + labels.append(name) + else: + labels.append(str(lb)) + return labels def _build_content_parts(