pinecone_rag/preprocessor/git_issue_preprocessor.py (259 additions, 0 deletions)
"""
GitHub issue preprocessor for Pinecone RAG (LLVM only).

Loads GitHub issue JSON files from ``data/github/**/issue/*.json`` (e.g.
``data/github/Clang/issue/84062.json``), extracts ``issue_info`` and comments,
and produces LangChain Documents for chunking and embedding. Only issues from the
LLVM GitHub organization (e.g. llvm/llvm-project) are included; others are skipped.

Each Document has metadata: title, url, author, number, state, state_reason,
created_at, updated_at, closed_at, and type ``"issue"``. Content is built from
labels, body, and comment threads.
"""

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_core.documents import Document

from config import GitConfig
from preprocessor.utility import get_timestamp_from_date, validate_content_length

logger = logging.getLogger(__name__)


def _get_nested(data: Dict[str, Any], *keys: str, default: Any = None) -> Any:
"""Get a nested key from a dict by path; return default if any key is missing.

Args:
data: Root dictionary to traverse.
*keys: Sequence of keys to follow (e.g. "user", "login").
default: Value to return if the path is missing or a step is not a dict.

Returns:
The value at the key path, or default.
"""
for key in keys:
if not isinstance(data, dict):
return default
if key not in data:
return default
data = data[key]
return data
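
# Illustrative calls (hypothetical values, not part of the runtime path):
#   _get_nested({"user": {"login": "alice"}}, "user", "login")       -> "alice"
#   _get_nested({"user": None}, "user", "login", default="")         -> ""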


def _extract_labels(info: Dict[str, Any]) -> List[str]:
"""Extract label names from issue info.

Args:
info: GitHub issue object containing a "labels" array.

Returns:
List of label name strings (from each label's "name" field or str(label)).
"""
labels_raw = info.get("labels") or []
labels: List[str] = []
for lb in labels_raw:
if not lb:
continue
if isinstance(lb, dict):
name = (lb.get("name") or "").strip()
if name:
labels.append(name)
else:
labels.append(str(lb))
return labels


def _build_content_parts(
labels: List[str], body: str, comments: List[Any]
) -> List[str]:
"""Build list of content segments for the Document page_content.

Args:
labels: Label names to prefix (optional "Labels: ..." line).
body: Issue body text.
comments: List of comment dicts with body, user.login, created_at.

Returns:
List of strings to be joined: labels line (if any), body, then each
comment with a "--- Comment by {user} ({date}) ---" header.
"""
parts = [f"Labels: {', '.join(labels)}\n\n", body] if labels else [body]
    for com in comments:
        if not isinstance(com, dict):
            continue
        com_body = (com.get("body") or "").strip()
if not com_body:
continue
com_user = _get_nested(com, "user", "login", default="") or ""
com_created = com.get("created_at") or ""
parts.append(f"\n\n--- Comment by {com_user} ({com_created}) ---\n\n{com_body}")
return parts
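
# Illustration with hypothetical values: for labels=["clang", "crash-on-valid"],
# body="Segfault in X", and one comment by "alice", the joined content is roughly:
#   "Labels: clang, crash-on-valid\n\nSegfault in X\n\n"
#   "--- Comment by alice (2024-01-01T00:00:00Z) ---\n\nStack trace attached."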


def _info_to_document(
info: Dict[str, Any],
comments: List[Any],
json_path: Path,
min_content_length: int,
) -> Optional[Document]:
"""Build one Document from GitHub issue info and comments.

Args:
info: GitHub issue object (title, body, user, state, labels, etc.).
comments: List of comment dicts (body, user, created_at).
json_path: Path to the source JSON file (used for logging).
        min_content_length: Minimum character length for content; shorter content is skipped.

Returns:
A Document with combined content (labels, body, comments) and metadata,
or None if html_url is missing or content is too short.
"""
    html_url = (info.get("html_url") or "").strip()
if not html_url:
return None

title = (info.get("title") or "").strip()
body = (info.get("body") or "").strip()
author = _get_nested(info, "user", "login", default="") or ""
created_at = get_timestamp_from_date(info.get("created_at"))
updated_at = get_timestamp_from_date(info.get("updated_at"))
closed_at = info.get("closed_at")
if closed_at:
closed_at = get_timestamp_from_date(closed_at)
else:
closed_at = 0.0
labels = _extract_labels(info)
state = info.get("state", "") or ""
state_reason = info.get("state_reason", "") or ""

number = info.get("number", -1)

content = "".join(_build_content_parts(labels, body, comments)).strip()
if not validate_content_length(content, min_length=min_content_length):
logger.debug("Skip %s: content too short", json_path.name)
return None

meta: Dict[str, Any] = {
"author": author or "",
"title": title or "",
"number": number or -1,
"url": html_url or "",
"created_at": created_at or 0.0,
"updated_at": updated_at or 0.0,
"closed_at": closed_at or 0.0,
"type": "issue",
"state": state or "",
"state_reason": state_reason or "",
}

return Document(page_content=content, metadata=meta)


def _issue_json_to_document(
json_path: Path,
data: Dict[str, Any],
min_content_length: int,
) -> Optional[Document]:
"""Build one Document from a GitHub issue JSON file (issue_info + comments).

Args:
json_path: Path to the JSON file (for logging).
data: Parsed JSON root; must contain "issue_info" and optionally "comments".
min_content_length: Minimum content length; shorter documents are skipped.

Returns:
A Document with type "issue", or None if issue_info is missing or invalid.
"""
info = data.get("issue_info")
if not info or not isinstance(info, dict):
logger.debug("Skip %s: no issue_info", json_path.name)
return None
comments = data.get("comments") or []
return _info_to_document(info, comments, json_path, min_content_length)


def _load_one_json(
json_path: Path,
min_content_length: int,
) -> Optional[Document]:
"""Load one JSON file and return an issue Document, or None.

Args:
json_path: Path to the JSON file.
min_content_length: Minimum content length; shorter documents are skipped.

Returns:
A Document for the issue, or None if the file is invalid, missing
issue_info, or content is too short.
"""
try:
raw = json_path.read_text(encoding="utf-8", errors="replace")
data = json.loads(raw)
except (json.JSONDecodeError, OSError) as e:
logger.debug("Skip %s: %s", json_path.name, e)
return None

if not isinstance(data, dict):
logger.debug("Skip %s: JSON root is not an object", json_path.name)
return None
if "issue_info" not in data:
logger.debug("Skip %s: no issue_info", json_path.name)
return None
return _issue_json_to_document(json_path, data, min_content_length)


class GitIssuePreprocessor:
"""Load LLVM GitHub issue JSON files and produce Documents.

Discovers all ``*.json`` under ``config.data_dir / "issue"`` (e.g.
``data/github/Clang/issue/*.json``), parses each as an issue with
issue_info and comments. Only issues from the LLVM GitHub organization
(e.g. llvm/llvm-project) are included; others are skipped. Returns a list
of LangChain Documents for RAG ingestion.
"""

def __init__(self, config: Optional[GitConfig] = None):
"""Initialize the preprocessor with optional GitConfig.

Args:
config: Git configuration (data_dir, min_content_length). If None,
uses default GitConfig().
"""
self.config = config or GitConfig()
self.data_dir = Path(self.config.data_dir) / "issue"
self.min_content_length = self.config.min_content_length

def load_documents(self, limit: Optional[int] = None) -> List[Document]:
"""Load issue JSON files from the configured issue directory and convert to Documents.

Args:
limit: Optional maximum number of JSON files to process (by sorted path).
If None, all discovered *.json files are processed.

Returns:
List of Documents (one per valid issue JSON file) with combined
content (labels, body, comments) and metadata. Skips invalid files
and those with content below min_content_length.
"""
if not self.data_dir.exists():
logger.warning("Git data dir does not exist: %s", self.data_dir)
return []

json_paths = sorted(self.data_dir.rglob("*.json"))
if limit is not None:
json_paths = json_paths[:limit]

documents: List[Document] = []
for json_path in json_paths:
doc = _load_one_json(json_path, self.min_content_length)
if doc is not None:
documents.append(doc)

logger.info(
"Loaded %d GitHub issue documents from %s",
len(documents),
self.data_dir,
)
return documents
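

# Minimal usage sketch (illustrative; assumes the default GitConfig points data_dir
# at an issue dump like data/github/Clang, as described in the module docstring):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    preprocessor = GitIssuePreprocessor()
    docs = preprocessor.load_documents(limit=10)
    for doc in docs:
        logger.info("%s (%d chars)", doc.metadata["url"], len(doc.page_content))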