From 636c9f4bb4f1b3ce1df9313e675ff3c3dae4ec53 Mon Sep 17 00:00:00 2001 From: mac Date: Mon, 23 Feb 2026 12:10:56 -0800 Subject: [PATCH 1/3] #86-add phabricator preprocessor for clang --- .../preprocessor/phabricator_preprocessort.py | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 pinecone_rag/preprocessor/phabricator_preprocessort.py diff --git a/pinecone_rag/preprocessor/phabricator_preprocessort.py b/pinecone_rag/preprocessor/phabricator_preprocessort.py new file mode 100644 index 0000000..5788a2e --- /dev/null +++ b/pinecone_rag/preprocessor/phabricator_preprocessort.py @@ -0,0 +1,150 @@ +""" +Phabricator PR-like preprocessor for Pinecone RAG. + +Reads markdown files under data/phabricator/** and builds one Document per file. +Expected markdown header: +- # D<number> <title> [Open|Closed] +- > Username: <author> +- > Created at: <date text> +- > Url: https://reviews.llvm.org/D<number> +""" + +import logging +import re +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from langchain_core.documents import Document + +logger = logging.getLogger(__name__) + +_HEADER_RE = re.compile( + r"^#\s*D(?P<number>\d+)\s+(?P<title>.+?)\s+\[(?P<state>[^\]]+)\]\s*$" +) +_USERNAME_RE = re.compile(r"^>\s*Username:\s*(.+?)\s*$", re.MULTILINE) +_CREATED_AT_RE = re.compile(r"^>\s*Created at:\s*(.+?)\s*$", re.MULTILINE) +_URL_RE = re.compile(r"^>\s*Url:\s*(https?://\S+)\s*$", re.MULTILINE) +_COMMENT_RE = re.compile(r"^##\s*Comment\s+\d+", re.MULTILINE) + +_CLOSED_STATES = {"closed", "abandoned", "merged"} + + +def _is_valid_content(text: str, min_length: int) -> bool: + return bool(text and len(text.strip()) >= min_length) + + +def _parse_created_at_to_timestamp(value: str) -> float: + if not value: + return 0.0 + + patterns = [ + "%b %d %Y, %I:%M %p", # Jan 18 2023, 5:56 PM + "%b %d %Y, %H:%M", # Jan 18 2023, 17:56 + ] + for pattern in patterns: + try: + return datetime.strptime(value, pattern).timestamp() + except
Exception: + continue + return 0.0 + + +def _extract_metadata(md_text: str, file_path: Path) -> Dict[str, Any]: + lines = md_text.splitlines() + first_line = lines[0].strip() if lines else "" + + header_match = _HEADER_RE.match(first_line) + if header_match: + number = int(header_match.group("number")) + title = header_match.group("title").strip() + state = header_match.group("state").strip() + else: + number = -1 + title = file_path.stem + state = "" + + user_match = _USERNAME_RE.search(md_text) + url_match = _URL_RE.search(md_text) + + author = user_match.group(1).strip() if user_match else "" + url = url_match.group(1).strip() if url_match else "" + + if not url and number > 0: + url = f"https://reviews.llvm.org/D{number}" + + # Collect all "Created at:" timestamps from PR header + all comments/reviews. + # The first match is the PR's own creation time; the maximum is the last activity. + all_timestamps = [ + _parse_created_at_to_timestamp(raw.strip()) + for raw in _CREATED_AT_RE.findall(md_text) + ] + valid_timestamps = [ts for ts in all_timestamps if ts > 0.0] + + created_at = valid_timestamps[0] if valid_timestamps else 0.0 + last_activity = max(valid_timestamps) if valid_timestamps else 0.0 + updated_at = last_activity + closed_at = last_activity if state.lower() in _CLOSED_STATES else 0.0 + + return { + "type": "pr-phabricator", + "number": number, + "title": title, + "url": url, + "author": author, + "state": state.lower(), + "state_reason": "", + "created_at": created_at, + "updated_at": updated_at, + "closed_at": closed_at, + } + + +def _load_pr_document(md_path: Path, min_content_length: int) -> Optional[Document]: + try: + content = md_path.read_text(encoding="utf-8", errors="replace").strip() + except OSError as exc: + logger.debug("Skip %s: %s", md_path.name, exc) + return None + + if not _is_valid_content(content, min_content_length): + logger.debug("Skip %s: content too short", md_path.name) + return None + + metadata = _extract_metadata(content, 
md_path) + return Document(page_content=content, metadata=metadata) + + +class PhabricatorPrPreprocessor: + """Load Phabricator markdown files from data/phabricator and produce Documents.""" + + def __init__( + self, + data_dir: str = "data/github/Clang/phabricator", + min_content_length: int = 10, + ): + self.data_dir = Path(data_dir) + self.min_content_length = min_content_length + + def load_documents(self, limit: Optional[int] = None) -> List[Document]: + """Load Phabricator markdown files from data/github/Clang/phabricator/**/*.md.""" + if not self.data_dir.exists(): + logger.warning("Phabricator data dir does not exist: %s", self.data_dir) + return [] + + md_paths = sorted(self.data_dir.rglob("*.md")) + if limit is not None: + md_paths = md_paths[:limit] + + documents: List[Document] = [] + for md_path in md_paths: + doc = _load_pr_document(md_path, self.min_content_length) + if doc is not None: + documents.append(doc) + + logger.info( + "Loaded %d Phabricator PR documents from %s", + len(documents), + self.data_dir, + ) + return documents From 443a7a46af1d24e1e7d08e4150b648b67fa56c51 Mon Sep 17 00:00:00 2001 From: mac <henryw910816@outlook.com> Date: Tue, 24 Feb 2026 09:26:26 -0800 Subject: [PATCH 2/3] #86-rename and fix some errors --- ...r_preprocessort.py => phabricator_preprocessor.py} | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) rename pinecone_rag/preprocessor/{phabricator_preprocessort.py => phabricator_preprocessor.py} (90%) diff --git a/pinecone_rag/preprocessor/phabricator_preprocessort.py b/pinecone_rag/preprocessor/phabricator_preprocessor.py similarity index 90% rename from pinecone_rag/preprocessor/phabricator_preprocessort.py rename to pinecone_rag/preprocessor/phabricator_preprocessor.py index 5788a2e..6f2b68b 100644 --- a/pinecone_rag/preprocessor/phabricator_preprocessort.py +++ b/pinecone_rag/preprocessor/phabricator_preprocessor.py @@ -11,7 +11,7 @@ import logging import re -from datetime import datetime +from datetime 
import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional @@ -25,7 +25,6 @@ _USERNAME_RE = re.compile(r"^>\s*Username:\s*(.+?)\s*$", re.MULTILINE) _CREATED_AT_RE = re.compile(r"^>\s*Created at:\s*(.+?)\s*$", re.MULTILINE) _URL_RE = re.compile(r"^>\s*Url:\s*(https?://\S+)\s*$", re.MULTILINE) -_COMMENT_RE = re.compile(r"^##\s*Comment\s+\d+", re.MULTILINE) _CLOSED_STATES = {"closed", "abandoned", "merged"} @@ -44,8 +43,10 @@ def _parse_created_at_to_timestamp(value: str) -> float: ] for pattern in patterns: try: - return datetime.strptime(value, pattern).timestamp() - except Exception: + dt = datetime.strptime(value, pattern).replace(tzinfo=timezone.utc) + return dt.timestamp() + except ValueError: + logger.debug("Date parse failed for pattern '%s': %s", pattern, value) continue return 0.0 @@ -81,7 +82,7 @@ def _extract_metadata(md_text: str, file_path: Path) -> Dict[str, Any]: ] valid_timestamps = [ts for ts in all_timestamps if ts > 0.0] - created_at = valid_timestamps[0] if valid_timestamps else 0.0 + created_at = min(valid_timestamps) if valid_timestamps else 0.0 last_activity = max(valid_timestamps) if valid_timestamps else 0.0 updated_at = last_activity closed_at = last_activity if state.lower() in _CLOSED_STATES else 0.0 From 8fd3c6b207374cfb047e3f90faef4e4cccde7e9c Mon Sep 17 00:00:00 2001 From: mac <henryw910816@outlook.com> Date: Tue, 24 Feb 2026 11:00:28 -0800 Subject: [PATCH 3/3] #86-add docstring --- .../preprocessor/phabricator_preprocessor.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pinecone_rag/preprocessor/phabricator_preprocessor.py b/pinecone_rag/preprocessor/phabricator_preprocessor.py index 6f2b68b..282af42 100644 --- a/pinecone_rag/preprocessor/phabricator_preprocessor.py +++ b/pinecone_rag/preprocessor/phabricator_preprocessor.py @@ -30,10 +30,12 @@ def _is_valid_content(text: str, min_length: int) -> bool: + """Return True if text is non-empty and has at least 
min_length characters (after strip).""" return bool(text and len(text.strip()) >= min_length) def _parse_created_at_to_timestamp(value: str) -> float: + """Parse Phabricator 'Created at' date string to Unix timestamp (UTC). Returns 0.0 on empty or parse failure.""" if not value: return 0.0 @@ -52,6 +54,7 @@ def _parse_created_at_to_timestamp(value: str) -> float: def _extract_metadata(md_text: str, file_path: Path) -> Dict[str, Any]: + """Parse markdown content for D-number, title, state, author, URL and timestamps; return metadata dict.""" lines = md_text.splitlines() first_line = lines[0].strip() if lines else "" @@ -102,6 +105,7 @@ def _extract_metadata(md_text: str, file_path: Path) -> Dict[str, Any]: def _load_pr_document(md_path: Path, min_content_length: int) -> Optional[Document]: + """Load one Phabricator markdown file and convert it into a Document with extracted metadata.""" try: content = md_path.read_text(encoding="utf-8", errors="replace").strip() except OSError as exc: @@ -117,13 +121,20 @@ def _load_pr_document(md_path: Path, min_content_length: int) -> Optional[Docume class PhabricatorPrPreprocessor: - """Load Phabricator markdown files from data/phabricator and produce Documents.""" + """ + Load Phabricator PR-style markdown files and produce LangChain Documents. + + Expects markdown with header lines for D-number, title, state, username, + created-at, and URL. Call load_documents() to scan the data directory and + return a list of Document instances with extracted metadata. + """ def __init__( self, data_dir: str = "data/github/Clang/phabricator", min_content_length: int = 10, ): + """Initialize with the directory containing Phabricator markdown files and minimum content length.""" self.data_dir = Path(data_dir) self.min_content_length = min_content_length