diff --git a/.env.example b/.env.example index 4c3c5b1e..468461a2 100644 --- a/.env.example +++ b/.env.example @@ -76,19 +76,6 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Slack webhook URL (get from Slack: https://api.slack.com/messaging/webhooks) # SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL -# ============================================================================= -# Clang GitHub Tracker -# ============================================================================= -# GitHub repo to sync (default: llvm/llvm-project). -# CLANG_GITHUB_OWNER=llvm -# CLANG_GITHUB_REPO=llvm-project -# -# Private repo for Markdown export (optional). -# Issues/PRs are exported to: issues/YYYY/YYYY-MM/#N - title.md -# If unset, upload is skipped and an error is logged. -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER=your-org -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME=your-private-repo -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH=main # ============================================================================= # GitHub tokens (multiple use cases) @@ -108,6 +95,13 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # GitHub repo to sync (default: llvm/llvm-project). # CLANG_GITHUB_OWNER=llvm # CLANG_GITHUB_REPO=llvm-project +# Markdown publish target (optional; see also Clang section above). +# CLANG_GITHUB_CONTEXT_REPO_OWNER=your-org +# CLANG_GITHUB_CONTEXT_REPO_NAME=your-repo +# CLANG_GITHUB_CONTEXT_REPO_BRANCH=main +# If that repo is private: set GITHUB_TOKEN_WRITE to a PAT that can read+push it +# (classic: repo scope; fine-grained: grant this repository). Publish uses the +# write token, not GITHUB_TOKENS_SCRAPING. # Pinecone sync (run_cppa_pinecone_sync) — app_type and namespace when triggering from this app. # CLANG_GITHUB_PINECONE_APP_TYPE=github-clang # CLANG_GITHUB_PINECONE_NAMESPACE=github-clang @@ -170,17 +164,18 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # REPO_COUNT_LANGUAGES=C++,Python,Rust # ============================================================================= -# Boost Library Usage Dashboard (optional; for --publish) +# Boost Library Usage Dashboard # ============================================================================= -# When set, run_boost_library_usage_dashboard --publish uses a persistent clone -# at raw/boost_library_usage_dashboard// (clone if missing, pull, copy, push). +# Target repo for publishing (run_boost_library_usage_dashboard without --skip-publish). +# Clone/pull/push uses GITHUB_TOKEN_WRITE (see GitHub tokens above). 
# BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER=your-org # BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO=your-dashboard-repo -# Token for clone/pull/push (defaults to GITHUB_TOKEN_WRITE if unset) -# BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN=ghp_xxxx -# Branch to publish to # BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH=main +# Git commit author identity used when publishing (defaults shown) +# GIT_AUTHOR_NAME=unknown +# GIT_AUTHOR_EMAIL=unknown@noreply.github.com + # ============================================================================= # Workspace (optional; default: project_root/workspace) # ============================================================================= @@ -256,3 +251,15 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Path to context repository (where markdown files are exported) # DISCORD_CONTEXT_REPO_PATH=F:\boost\discord-cplusplus-together-context + +# ============================================================================= +# YouTube (cppa_youtube_script_tracker) +# ============================================================================= +# YouTube Data API v3 key (console.cloud.google.com → APIs & Services → Credentials) +# YOUTUBE_API_KEY=... + +# Pinecone namespace for YouTube video/transcript sync (default: youtube-scripts) +# YOUTUBE_PINECONE_NAMESPACE=youtube-scripts + +# Earliest published_at to use when DB is empty (ISO 8601, e.g. 2015-01-01T00:00:00Z) +# YOUTUBE_DEFAULT_PUBLISHED_AFTER=2015-01-01T00:00:00Z diff --git a/.github/workflows/deploy-script/deploy.sh b/.github/workflows/deploy-script/deploy.sh index 716d1c69..ea5ac398 100644 --- a/.github/workflows/deploy-script/deploy.sh +++ b/.github/workflows/deploy-script/deploy.sh @@ -60,4 +60,7 @@ until make health >/dev/null 2>&1; do done log "Stack is healthy." +log "Sending startup notification..." +DEPLOY_BRANCH="$BRANCH" make notify || log "WARNING: Startup notification failed (non-fatal)." + log "Deploy completed." diff --git a/.gitignore b/.gitignore index aeb51c0a..d9be1a38 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ discord_activity_tracker/tools/ config/boost_collector_schedule.yaml # temp files temp/ +nul diff --git a/Dockerfile b/Dockerfile index 5b7b51c4..8a26be96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,6 +33,10 @@ RUN chmod +x /app/docker-entrypoint.sh # Entrypoint runs as root, chowns mounted dirs, then exec's CMD as appuser via gosu RUN useradd --create-home appuser && chown -R appuser /app +# Git 2.35+ blocks repos when directory owner != current user; bind mounts often +# disagree (e.g. Docker Desktop on Windows). System config applies to root and appuser +# (e.g. docker exec as root vs gosu appuser in entrypoint). 
+RUN git config --system --add safe.directory '/app/workspace/*' ENTRYPOINT ["/app/docker-entrypoint.sh"] # Container starts as root so entrypoint can chown; CMD runs as appuser via gosu diff --git a/Makefile b/Makefile index 317972dc..5ead9d52 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ SHELL := /bin/bash COMPOSE := docker compose APP := web +BEAT := celery_beat MANAGE := $(COMPOSE) run --rm $(APP) python manage.py .DEFAULT_GOAL := help @@ -32,6 +33,7 @@ help: @echo " Logs & status" @echo " ps Show running containers" @echo " health Verify DB, Redis, Selenium, and Celery containers" + @echo " notify Send Slack/Discord startup notification (celery_beat; optional DEPLOY_BRANCH)" @echo " logs Follow logs for all services" @echo " logs-web Follow logs for the web service" @echo " logs-worker Follow logs for the Celery worker" @@ -101,6 +103,10 @@ health: $(COMPOSE) ps --status running celery_worker | grep -q celery_worker $(COMPOSE) ps --status running celery_beat | grep -q celery_beat +.PHONY: notify +notify: + $(COMPOSE) exec -T -e DEPLOY_BRANCH="$(DEPLOY_BRANCH)" $(BEAT) python manage.py send_startup_notification + .PHONY: logs logs: $(COMPOSE) logs -f diff --git a/boost_library_docs_tracker/fetcher.py b/boost_library_docs_tracker/fetcher.py index 7902cb16..66c01869 100644 --- a/boost_library_docs_tracker/fetcher.py +++ b/boost_library_docs_tracker/fetcher.py @@ -76,9 +76,9 @@ def download_source_zip(version: str, dest_dir: Path) -> Path: zip_name = f"boost_{normalized.replace('.', '_')}.zip" zip_path = dest_dir / zip_name - # if zip_path.exists(): - # logger.info("Source zip already present, skipping download: %s", zip_path) - # return zip_path + if zip_path.exists(): + logger.info("Source zip already present, skipping download: %s", zip_path) + return zip_path dest_dir.mkdir(parents=True, exist_ok=True) session = _get_session() @@ -320,16 +320,26 @@ def crawl_library_pages( # Enqueue in-scope links soup = BeautifulSoup(resp.text, "lxml") - for a in soup.find_all("a", href=True): - href: str = a["href"] - abs_url = urljoin(final_url, href) - # Strip fragment - abs_url = abs_url.split("#")[0] - if ( - abs_url not in visited - and abs_url.startswith(start_url) - and abs_url not in queue - ): + lib_segment = lib_key.split("/")[-1] + if not lib_segment: + logger.warning( + "Empty library key segment for lib_key=%r; skipping link discovery for %s", + lib_key, + final_url, + ) + else: + for a in soup.find_all("a", href=True): + href: str = a["href"] + abs_url = urljoin(final_url, href) + # Strip fragment + abs_url = abs_url.split("#")[0] + if not abs_url.startswith(base_url): + continue + # Stay within this library's doc subtree (path contains lib segment) + if lib_segment not in abs_url: + continue + if abs_url in visited or abs_url in queue: + continue queue.append(abs_url) logger.debug( diff --git a/boost_library_docs_tracker/html_to_md.py b/boost_library_docs_tracker/html_to_md.py index 922c3fec..97862bc1 100644 --- a/boost_library_docs_tracker/html_to_md.py +++ b/boost_library_docs_tracker/html_to_md.py @@ -9,7 +9,7 @@ -------- 1. _preprocess_html – remove Boost boilerplate from HTML before pandoc sees it 2. _pandoc_convert – HTML → GFM via pypandoc (CLI fallback) -3. _postprocess_markdown – strip residual HTML artefacts and rejoin split lines +3. 
_postprocess_markdown – strip residual HTML artefacts, rejoin split lines, then clean_text (unicode/line endings only) """ import re @@ -17,6 +17,8 @@ from bs4 import BeautifulSoup +from core.utils.text_processing import clean_text + try: import pypandoc except Exception: # optional runtime dependency @@ -299,4 +301,7 @@ def _postprocess_markdown(md: str) -> str: # 12. Collapse excessive blank lines to at most two md = _RE_EXCESS_BLANK.sub("\n\n", md) - return md.strip() + "\n" + # 13. Unicode / line-ending cleanup (no space collapsing — preserves markdown indent) + md = clean_text(md, remove_extra_spaces=False) + + return md.rstrip() + "\n" diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index ac6ef248..2221aef3 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -11,10 +11,11 @@ 3. For each library, fetch docs and save to workspace: - Default (--use-local not set): HTTP BFS crawl per library. - --use-local: download source zip once per version, extract, walk local HTML. - Zip is saved in workspace/raw/boost_library_docs_tracker/ and is not deleted. + Zip is saved in workspace/raw/boost_library_docs_tracker/. Extract tree is saved in workspace/boost_library_docs_tracker/extracted/. Converted page content is saved in workspace/boost_library_docs_tracker/converted/. - Pass --cleanup-extract to delete the extract tree after all libraries are done. + Pass --cleanup-extract to delete the extract tree and the downloaded zip after + all libraries for that version are done. 4. Fill BoostDocContent and BoostLibraryDocumentation tables (no page_content in DB). - New content_hash → create new BoostDocContent row, set first_version and last_version. - Same content_hash but different URL → update url and scraped_at, update last_version. @@ -38,7 +39,6 @@ import logging from pathlib import Path -from django.apps import apps from django.core.management.base import BaseCommand, CommandError from boost_library_docs_tracker import fetcher, services, workspace @@ -47,8 +47,8 @@ logger = logging.getLogger(__name__) -APP_TYPE = "boost_library_docs" -PINECONE_NAMESPACE = "boost_library_docs" +APP_TYPE = "boost-library-documentation" +PINECONE_NAMESPACE = "boost-library-documentation" DEFAULT_MAX_PAGES = 10 @@ -103,8 +103,8 @@ def add_arguments(self, parser): "--cleanup-extract", action="store_true", help=( - "Delete the extracted source tree after all libraries for a version are " - "processed (only with --use-local)." + "Delete the extracted source tree and the raw zip under workspace/raw/ " + "after all libraries for a version are processed (only with --use-local)." ), ) @@ -180,9 +180,10 @@ def _process_version( self.stdout.write(f"[{version}] {len(library_list)} library/libraries.") source_root: Path | None = None + zip_path: Path | None = None if use_local: - source_root = self._prepare_local_source(version=version) + source_root, zip_path = self._prepare_local_source(version=version) # Resolve once per version; used to track first/last_version on BoostDocContent. 
boost_version_id = self._resolve_boost_version_id(version) @@ -202,20 +203,36 @@ def _process_version( if use_local and cleanup_extract and source_root is not None: fetcher.delete_extract_dir(source_root) + if zip_path is not None: + try: + zip_path.unlink(missing_ok=True) + self.stdout.write( + self.style.NOTICE( + f"[{version}] Removed source zip {zip_path.name}" + ) + ) + except OSError as exc: + logger.warning("Could not remove source zip %s: %s", zip_path, exc) + self.stdout.write( + self.style.WARNING( + f"[{version}] Could not remove source zip: {exc}" + ) + ) self.stdout.write(f"[{version}] Done — {total_pages} pages total.") - def _prepare_local_source(self, *, version: str) -> Path: + def _prepare_local_source(self, *, version: str) -> tuple[Path, Path]: """Download and extract the Boost source zip for a version. - Returns source_root — the top-level extracted directory. + Returns (source_root, zip_path): top-level extracted directory and path to + the zip under workspace/raw/boost_library_docs_tracker/. """ zip_dir = workspace.get_zip_dir() extract_dir = workspace.get_extract_dir() - if zip_dir.exists(): - self.stdout.write(f"[{version}] Source zip already exists at {zip_dir}") - return extract_dir + # if zip_dir.exists(): + # self.stdout.write(f"[{version}] Source zip already exists at {zip_dir}") + # return extract_dir try: zip_path = fetcher.download_source_zip(version, zip_dir) @@ -232,7 +249,7 @@ def _prepare_local_source(self, *, version: str) -> Path: ) from exc self.stdout.write(f"[{version}] Source ready at {source_root}") - return source_root + return source_root, zip_path def _process_library( self, @@ -308,7 +325,7 @@ def _process_library( def _save_pages_to_workspace_and_db( self, *, version, lib_name, lib_version_id, boost_version_id, pages ): - created = changed = unchanged = 0 + created = unchanged = 0 for url, page_text in pages: content_hash = hashlib.sha256(page_text.encode()).hexdigest() @@ -331,8 +348,6 @@ def _save_pages_to_workspace_and_db( if change_type == "created": created += 1 - elif change_type == "content_changed": - changed += 1 else: unchanged += 1 @@ -346,25 +361,14 @@ def _save_pages_to_workspace_and_db( exc, ) - self.stdout.write( - f" [{lib_name}] created={created}, changed={changed}, unchanged={unchanged}." - ) + self.stdout.write(f" [{lib_name}] created={created}, unchanged={unchanged}.") # ------------------------------------------------------------------ # Pinecone sync # ------------------------------------------------------------------ def _sync_pinecone(self): - if not apps.is_installed("cppa_pinecone_sync"): - self.stdout.write( - self.style.WARNING( - "Skipping Pinecone sync: 'cppa_pinecone_sync' is not in INSTALLED_APPS." - ) - ) - self.stdout.write( - "Add 'cppa_pinecone_sync' to INSTALLED_APPS or run with --skip-pinecone." - ) - return + """Sync to Pinecone""" try: from cppa_pinecone_sync.sync import sync_to_pinecone @@ -390,7 +394,6 @@ def _sync_pinecone(self): return successful_ids = result.get("successful_source_ids", []) - failed_ids = result.get("failed_ids", []) int_successful_ids: list[int] = [] for sid in successful_ids: try: @@ -477,27 +480,32 @@ def _get_library_list(self, version: str) -> list[tuple[Path, str]]: result.append((start_path, lib_key)) return result - def _resolve_library_version_id(self, lib_name: str, version: str) -> int | None: + def _resolve_library_version_id(self, lib_key: str, version: str) -> int | None: """Resolve BoostLibraryVersion id from DB. 
Returns None if not found.""" - try: - lv = BoostLibraryVersion.objects.select_related("library", "version").get( - library__name=lib_name, - version__version=version, - ) - return lv.pk - except BoostLibraryVersion.DoesNotExist: + lib_key = (lib_key or "").strip() + if not lib_key: return None - except BoostLibraryVersion.MultipleObjectsReturned: + + base_qs = BoostLibraryVersion.objects.select_related( + "library", "version" + ).filter(version__version=version) + # 1) Preferred: key + version + qs = base_qs.filter(key=lib_key) + lv = qs.first() + if lv: + return lv.pk + + # 2) Optional compatibility fallback: name + version + qs = base_qs.filter(library__name=lib_key) + lv = qs.first() + if lv: logger.warning( - "Multiple BoostLibraryVersion rows for lib=%s ver=%s; using first.", - lib_name, + "Resolved by library name fallback (missing/mismatched key): lib_key=%s, version=%s", + lib_key, version, ) - lv = BoostLibraryVersion.objects.filter( - library__name=lib_name, - version__version=version, - ).first() - return lv.pk if lv is not None else None + return lv.pk + return None def _resolve_boost_version_id(self, version: str) -> int | None: """Resolve BoostVersion PK from the version string. Returns None if not found.""" diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index 2b36c199..6389f105 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -4,8 +4,11 @@ Called by cppa_pinecone_sync.sync.sync_to_pinecone as the preprocess_fn argument. Signature matches the PreprocessFn contract: (failed_ids: list[str], final_sync_at: datetime | None) - -> tuple[list[dict], bool] - OR tuple[list[dict], bool, list[dict]] (with metadata updates) + -> tuple[list[dict], bool, list[dict]] + +The third list is metas_to_update: already-upserted rows whose scraped_at is +after final_sync_at (metadata refresh in Pinecone). Empty when final_sync_at +is None or nothing is stale. The failed_ids values come from failed_documents[*]["ids"] in the upsert result, which are BoostDocContent PKs encoded as strings. @@ -17,6 +20,9 @@ from datetime import datetime from typing import Any +from core.utils.boost_version_operations import encode_boost_version_string +from core.utils.text_processing import clean_text + from .models import BoostDocContent from . import workspace @@ -26,30 +32,33 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, -) -> tuple[list[dict[str, Any]], bool]: +) -> tuple[list[dict[str, Any]], bool, list[dict[str, Any]]]: """ - Build documents for Pinecone upsert from BoostDocContent records. + Build documents for Pinecone upsert and optional metadata updates. - Selects BoostDocContent records where is_upserted=False (not yet synced) - or whose PK is in failed_ids (retry after a previous failure). - final_sync_at is accepted for interface compatibility but is not used — - is_upserted is the authoritative sync state. + Upsert batch: BoostDocContent where is_upserted=False or PK is in failed_ids + (retry). Loads page text from the workspace for each row. - For each selected record: - - Resolves first_version / last_version from the FK fields on BoostDocContent. - - Loads page content from the workspace file. - - Returns source ids in metadata["ids"] so the caller can mark - BoostDocContent.is_upserted=True only after a successful Pinecone upsert. 
+ Metadata batch (metas_to_update): when final_sync_at is set, rows with + is_upserted=True and scraped_at > final_sync_at (re-scraped after last sync), + excluding failed_ids. Same document shape as the upsert batch so + ingestion.update_documents can refresh metadata; doc_id remains content_hash. - Returns (documents, is_chunked=False). - doc_id in metadata is the content_hash of the BoostDocContent row. + When final_sync_at is None, metas_to_update is always [] (no incremental + stale-metadata pass). + + Returns (documents, is_chunked=False, metas_to_update). """ - records = _select_records(failed_ids, final_sync_at) - if not records: - return [], False + int_failed_ids = _parse_int_ids(failed_ids) + upsert_records = _select_upsert_records(int_failed_ids) + meta_records = _select_metadata_update_records(int_failed_ids, final_sync_at) - documents, _ids_to_mark = _build_documents(records) - return documents, False + if not upsert_records and not meta_records: + return [], False, [] + + documents, _ = _build_documents(upsert_records) + metas_to_update, _ = _build_documents(meta_records) + return documents, False, metas_to_update # --------------------------------------------------------------------------- @@ -57,19 +66,10 @@ def preprocess_for_pinecone( # --------------------------------------------------------------------------- -def _select_records( - failed_ids: list[str], - final_sync_at: datetime | None, -) -> list[BoostDocContent]: - """Return BoostDocContent records to process. - - Selects rows that are not yet upserted (is_upserted=False) or are in - failed_ids for retry. The final_sync_at parameter is accepted for interface - compatibility but is not used — is_upserted is the authoritative sync state. - """ +def _select_upsert_records(int_failed_ids: list[int]) -> list[BoostDocContent]: + """Rows to vector-upsert: not yet upserted or explicitly failed (retry).""" from django.db.models import Q - int_failed_ids = _parse_int_ids(failed_ids) query = Q(is_upserted=False) if int_failed_ids: query |= Q(pk__in=int_failed_ids) @@ -82,6 +82,27 @@ def _select_records( return list(qs) +def _select_metadata_update_records( + int_failed_ids: list[int], + final_sync_at: datetime | None, +) -> list[BoostDocContent]: + """Rows needing Pinecone metadata refresh only (already upserted, scraped since sync).""" + if final_sync_at is None: + return [] + + qs = ( + BoostDocContent.objects.filter( + is_upserted=True, + scraped_at__gt=final_sync_at, + ) + .select_related("first_version", "last_version") + .order_by("id") + ) + if int_failed_ids: + qs = qs.exclude(pk__in=int_failed_ids) + return list(qs) + + def _parse_int_ids(failed_ids: list[str]) -> list[int]: """Convert string IDs to ints, skipping malformed values. 
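Note: the three-part return value above implies a specific caller-side protocol — embed and upsert the documents batch, mark the corresponding rows via metadata["source_ids"] only on success, and push metas_to_update as metadata-only refreshes. A minimal sketch of that protocol follows, under stated assumptions: the two helper functions are placeholders for the real upsert / metadata-update steps in cppa_pinecone_sync (not its actual API); only the preprocess_for_pinecone signature, the document shape, and the is_upserted bookkeeping are taken from this module.

    from datetime import datetime

    from boost_library_docs_tracker.models import BoostDocContent
    from boost_library_docs_tracker.preprocessor import preprocess_for_pinecone


    def upsert_documents(documents: list[dict], chunked: bool) -> set[str]:
        """Stand-in for the real vector upsert; returns doc_ids that failed."""
        return set()


    def update_documents_metadata(documents: list[dict]) -> None:
        """Stand-in for a metadata-only refresh of already-indexed documents."""


    def run_docs_sync(failed_ids: list[str], final_sync_at: datetime | None) -> None:
        # New/failed rows to embed and upsert, plus already-upserted rows whose
        # Pinecone metadata is stale (scraped_at > final_sync_at).
        documents, is_chunked, metas_to_update = preprocess_for_pinecone(
            failed_ids, final_sync_at
        )

        failed_doc_ids = upsert_documents(documents, chunked=is_chunked)

        # Mark source rows only after a successful upsert, via metadata["source_ids"]
        # (BoostDocContent PKs encoded as strings).
        ok_pks = [
            int(doc["metadata"]["source_ids"])
            for doc in documents
            if doc["metadata"]["doc_id"] not in failed_doc_ids
        ]
        if ok_pks:
            BoostDocContent.objects.filter(pk__in=ok_pks).update(is_upserted=True)

        # Rows re-scraped since the last sync: refresh metadata only; doc_id stays
        # the content_hash, so no re-embedding is needed.
        if metas_to_update:
            update_documents_metadata(metas_to_update)
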
@@ -143,19 +164,27 @@ def _build_documents( ) continue + page_content = clean_text(page_content, remove_extra_spaces=False) + library_name = _get_library_name(doc_content) + metadata: dict[str, Any] = { + "doc_id": doc_content.content_hash, + "url": doc_content.url, + "library_name": library_name, + "source_ids": str(doc_content.pk), + } + fk = encode_boost_version_string(first_version_str) + if fk is not None: + metadata["first_version_key"] = fk + lk = encode_boost_version_string(last_version_str) + if lk is not None: + metadata["last_version_key"] = lk + documents.append( { "content": page_content, - "metadata": { - "doc_id": doc_content.content_hash, - "url": doc_content.url, - "first_version": first_version_str, - "last_version": last_version_str, - "library_name": library_name, - "ids": str(doc_content.pk), - }, + "metadata": metadata, } ) ids_to_mark.append(doc_content.pk) diff --git a/boost_library_docs_tracker/services.py b/boost_library_docs_tracker/services.py index b8e5e45e..830517e6 100644 --- a/boost_library_docs_tracker/services.py +++ b/boost_library_docs_tracker/services.py @@ -37,9 +37,10 @@ def get_or_create_doc_content( - On update: updates last_version to version_id. Returns (doc_content, change_type) where change_type is one of: - "created" — content_hash was not in DB; row inserted. - "content_changed" — URL exists for this hash but url field differs; url updated. - "unchanged" — content_hash already exists; only scraped_at and last_version updated. + "created" — content_hash was not in DB; row inserted. + "unchanged" — content_hash already existed; row may still be updated + (url, scraped_at, last_version / first_version as applicable). The document + body identity is the same hash, not a new page. Raises ValueError if url is empty. 
""" @@ -69,7 +70,6 @@ def get_or_create_doc_content( if obj.url != normalized_url: obj.url = normalized_url update_fields.append("url") - change_type = "content_changed" if version_id is not None: obj.last_version_id = version_id diff --git a/boost_library_docs_tracker/tests/test_preprocessor.py b/boost_library_docs_tracker/tests/test_preprocessor.py index 5fb482aa..18e013fb 100644 --- a/boost_library_docs_tracker/tests/test_preprocessor.py +++ b/boost_library_docs_tracker/tests/test_preprocessor.py @@ -1,13 +1,17 @@ """Tests for boost_library_docs_tracker.preprocessor.""" from datetime import timedelta +from unittest.mock import patch import pytest from django.utils import timezone from boost_library_docs_tracker import preprocessor, services +from boost_library_docs_tracker.models import BoostDocContent from boost_library_tracker import services as boost_library_services +_PAGE = "x" * 200 # long enough for downstream chunk validation if needed + @pytest.mark.django_db def test_get_library_name_uses_latest_relation( @@ -53,3 +57,83 @@ def test_get_library_name_uses_latest_relation( def test_get_library_name_returns_empty_without_relation(boost_doc_content): """_get_library_name returns an empty string when no relation exists.""" assert preprocessor._get_library_name(boost_doc_content) == "" + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_metas_when_upserted_and_scraped_after_final_sync( + _mock_load, + boost_doc_content, +): + """Stale upserted rows (scraped_at > final_sync_at) appear in metas_to_update only.""" + now = timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now, + ) + final_sync = now - timedelta(hours=1) + docs, chunked, metas = preprocessor.preprocess_for_pinecone([], final_sync) + assert docs == [] + assert chunked is False + assert len(metas) == 1 + assert metas[0]["metadata"]["doc_id"] == boost_doc_content.content_hash + assert metas[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_no_metas_when_final_sync_at_none(_mock_load, boost_doc_content): + """With final_sync_at None, metas_to_update is empty (no stale-metadata scan).""" + docs, chunked, metas = preprocessor.preprocess_for_pinecone([], None) + assert chunked is False + assert metas == [] + assert len(docs) == 1 + assert docs[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_metas_empty_when_scraped_before_final_sync( + _mock_load, + boost_doc_content, +): + """Upserted row scraped before final_sync_at is not in metas_to_update.""" + now = timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now - timedelta(hours=2), + ) + final_sync = now - timedelta(hours=1) + docs, _, metas = preprocessor.preprocess_for_pinecone([], final_sync) + assert docs == [] + assert metas == [] + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_meta_excludes_failed_ids(_mock_load, boost_doc_content): + """Rows in failed_ids are not selected for metadata-only update.""" + now = 
timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now, + ) + final_sync = now - timedelta(hours=1) + docs, _, metas = preprocessor.preprocess_for_pinecone( + [str(boost_doc_content.pk)], final_sync + ) + assert metas == [] + assert len(docs) == 1 + assert docs[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) diff --git a/boost_library_docs_tracker/tests/test_services.py b/boost_library_docs_tracker/tests/test_services.py index 1d47d917..7e4fb42e 100644 --- a/boost_library_docs_tracker/tests/test_services.py +++ b/boost_library_docs_tracker/tests/test_services.py @@ -41,8 +41,8 @@ def test_get_or_create_doc_content_unchanged_when_same_hash(): @pytest.mark.django_db -def test_get_or_create_doc_content_content_changed_when_url_differs(): - """get_or_create_doc_content returns 'content_changed' when url differs for same hash.""" +def test_get_or_create_doc_content_unchanged_when_url_differs_same_hash(): + """Same content_hash with a new URL still returns 'unchanged' (hash identity unchanged).""" services.get_or_create_doc_content( url="https://example.com/old-page", content_hash="c" * 64, @@ -51,7 +51,7 @@ def test_get_or_create_doc_content_content_changed_when_url_differs(): url="https://example.com/new-page", content_hash="c" * 64, ) - assert change_type == "content_changed" + assert change_type == "unchanged" obj2.refresh_from_db() assert obj2.url == "https://example.com/new-page" diff --git a/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py b/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py index c12e180d..fcc95101 100644 --- a/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py +++ b/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py @@ -121,8 +121,6 @@ def _push_markdown_to_github( all_new_files: dict[str, str], ) -> None: """Upload generated Markdown to BOOST_LIBRARY_TRACKER_REPO_*; unlink locals on success.""" - if not all_new_files: - return cfg = _markdown_export_repo_config() if not cfg: logger.error( @@ -504,17 +502,7 @@ def handle(self, *args, **options): if not skip_remote_push: logger.info("push Markdown to configured GitHub repo") - if not all_new_files: - if skip_markdown_export and not skip_github_sync: - logger.warning( - "nothing new to push (--skip-markdown-export); skipping remote push" - ) - elif skip_github_sync: - logger.warning( - "nothing to push from this run (sync was skipped)" - ) - else: - _push_markdown_to_github(md_output_dir, all_new_files) + _push_markdown_to_github(md_output_dir, all_new_files) else: logger.info("skipping remote push (--skip-remote-push)") diff --git a/boost_library_tracker/release_check.py b/boost_library_tracker/release_check.py index b3fed25d..7b9ec3df 100644 --- a/boost_library_tracker/release_check.py +++ b/boost_library_tracker/release_check.py @@ -12,7 +12,8 @@ """ import logging -import re + +from core.utils.boost_version_operations import parse_stable_boost_release_tag from boost_library_tracker.models import BoostVersion from github_ops.client import GitHubAPIClient @@ -23,29 +24,9 @@ MAIN_OWNER = "boostorg" MAIN_REPO = "boost" -# Only boost-X.Y.Z (three numeric parts, no suffix like -beta, -rc, etc.) 
-BOOST_TAG_PATTERN = re.compile(r"^boost-(\d+)\.(\d+)\.(\d+)$") MIN_BOOST_VERSION = (1, 16, 1) -def _parse_stable_version(tag_name: str) -> str | None: - """ - If ``tag_name`` is a stable release tag ``boost-X.Y.Z`` with version >= MIN_BOOST_VERSION, - return the canonical tag string (e.g. ``boost-1.90.0``). - - Return ``None`` for non-matching names, pre-release-style tags, or versions below the minimum. - """ - if not tag_name: - return None - m = BOOST_TAG_PATTERN.match(tag_name.strip()) - if not m: - return None - major, minor, patch = int(m.group(1)), int(m.group(2)), int(m.group(3)) - if (major, minor, patch) < MIN_BOOST_VERSION: - return None - return f"boost-{major}.{minor}.{patch}" - - def all_boost_versions_from_api() -> list[tuple[str, str]] | None: """ List stable Boost release tags from GitHub (``/repos/boostorg/boost/tags``). @@ -76,7 +57,9 @@ def all_boost_versions_from_api() -> list[tuple[str, str]] | None: if not page_tags: break for tag in page_tags: - stable_tag = _parse_stable_version(tag.get("name", "")) + stable_tag = parse_stable_boost_release_tag( + tag.get("name", ""), MIN_BOOST_VERSION + ) if not stable_tag: continue tag_commit = tag.get("commit") or {} diff --git a/boost_library_usage_dashboard/analyzer.py b/boost_library_usage_dashboard/analyzer.py index d9063ed9..0ffa3042 100644 --- a/boost_library_usage_dashboard/analyzer.py +++ b/boost_library_usage_dashboard/analyzer.py @@ -40,8 +40,7 @@ class BoostUsageDashboardAnalyzer: - def __init__(self, base_dir: Path, output_dir: Path): - self.base_dir = base_dir + def __init__(self, output_dir: Path): self.output_dir = output_dir self.dashboard_data_file = output_dir / "dashboard_data.json" self.report_file = output_dir / "Boost_Usage_Report_total.md" diff --git a/boost_library_usage_dashboard/management/__init__.py b/boost_library_usage_dashboard/management/__init__.py index e69de29b..a70886ec 100644 --- a/boost_library_usage_dashboard/management/__init__.py +++ b/boost_library_usage_dashboard/management/__init__.py @@ -0,0 +1 @@ +"""Django management package for boost_library_usage_dashboard.""" diff --git a/boost_library_usage_dashboard/management/commands/__init__.py b/boost_library_usage_dashboard/management/commands/__init__.py index e69de29b..edda1543 100644 --- a/boost_library_usage_dashboard/management/commands/__init__.py +++ b/boost_library_usage_dashboard/management/commands/__init__.py @@ -0,0 +1 @@ +"""Management commands for the boost_library_usage_dashboard app.""" diff --git a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py index b41144d8..0442e99a 100644 --- a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py +++ b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py @@ -1,172 +1,128 @@ +"""Build the Boost library usage dashboard from DB data and optionally publish to GitHub.""" + import logging -import shutil -from datetime import datetime -from pathlib import Path -from zoneinfo import ZoneInfo from django.conf import settings from django.core.management.base import BaseCommand, CommandError from boost_library_usage_dashboard.analyzer import BoostUsageDashboardAnalyzer +from boost_library_usage_dashboard.publisher import publish_dashboard from boost_library_usage_dashboard.renderer import render_dashboard_html from boost_library_usage_dashboard.report import write_summary_report from config.workspace 
import get_workspace_path -from github_ops.git_ops import clone_repo, pull, push logger = logging.getLogger(__name__) class Command(BaseCommand): + """Django management command: collect metrics, render HTML, optionally push to GitHub.""" + help = ( "Generate Boost library usage report/dashboard from PostgreSQL data, " - "then optionally publish generated files to a target GitHub repository." + "then publish generated files to a target GitHub repository unless skipped." ) def add_arguments(self, parser): + """Register skip flags and publish target overrides.""" + parser.add_argument( + "--skip-collect", + action="store_true", + help="Skip PostgreSQL collection and Markdown report generation.", + ) parser.add_argument( - "--publish", + "--skip-render", action="store_true", - help="Publish generated files to the repository configured in settings.", + help="Skip HTML rendering.", ) parser.add_argument( - "--target-branch", + "--skip-publish", + action="store_true", + help="Skip publishing to the configured GitHub repository.", + ) + parser.add_argument( + "--owner", type=str, - default="main", - help="Branch for pushing generated dashboard files.", + default="", + help="Publish repo owner (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER).", ) parser.add_argument( - "--output-dir", + "--repo", type=str, default="", - help="Custom output directory. Defaults to workspace/boost_library_usage_dashboard.", + help="Publish repo name (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO).", + ) + parser.add_argument( + "--branch", + type=str, + default="", + help="Branch to publish to (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH; default main).", ) def handle(self, *args, **options): - output_dir = ( - Path(options["output_dir"]).resolve() - if options["output_dir"] - else get_workspace_path("boost_library_usage_dashboard") - ) + """Run collect/render steps, then publish when configured and artifacts exist.""" + output_dir = get_workspace_path("boost_library_usage_dashboard").resolve() output_dir.mkdir(parents=True, exist_ok=True) - self.stdout.write("Step 1: Collecting dashboard data from PostgreSQL...") - analyzer = BoostUsageDashboardAnalyzer( - base_dir=settings.BASE_DIR, output_dir=output_dir - ) - stats = analyzer.run() + skip_collect = options["skip_collect"] + skip_render = options["skip_render"] + skip_publish = options["skip_publish"] - self.stdout.write("Step 2: Writing Markdown report...") - write_summary_report( - analyzer.report_file, - stats, - stars_min_threshold=analyzer.stars_min_threshold, - ) + if not skip_collect: + logger.info("Step 1: Collecting dashboard data from PostgreSQL...") + analyzer = BoostUsageDashboardAnalyzer(output_dir=output_dir) + stats = analyzer.run() + + logger.info("Step 2: Writing Markdown report...") + write_summary_report( + analyzer.report_file, + stats, + stars_min_threshold=analyzer.stars_min_threshold, + ) - self.stdout.write("Step 3: Rendering HTML files...") - render_dashboard_html(base_dir=settings.BASE_DIR, output_dir=output_dir) + if not skip_render: + logger.info("Step 3: Rendering HTML files...") + render_dashboard_html(base_dir=settings.BASE_DIR, output_dir=output_dir) - self.stdout.write( - self.style.SUCCESS(f"Dashboard artifacts generated at: {output_dir}") - ) + if not skip_collect or not skip_render: + logger.info("Dashboard artifacts at: %s", output_dir) - if options["publish"]: - owner = ( + if not skip_publish: + owner = (options["owner"] or "").strip() or ( getattr(settings, 
"BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "") or "" ).strip() - repo = ( + repo = (options["repo"] or "").strip() or ( getattr(settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", "") or "" ).strip() branch = ( - getattr( - settings, - "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", - "", + (options["branch"] or "").strip() + or ( + getattr( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", + "", + ) + or "" + ).strip() + or "main" + ) + + if not owner or not repo: + logger.warning( + "Skipping publish: set BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER " + "and BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO in settings, or pass " + "--owner and --repo." ) - or "" - ).strip() or options["target_branch"] - if owner and repo: - self._publish_via_raw_clone( + else: + if not any(output_dir.rglob("*.html")): + raise CommandError( + "Refusing to publish: no HTML artifacts were found in " + f"{output_dir}. Run without --skip-render first." + ) + publish_dashboard( output_dir=output_dir, owner=owner, repo=repo, branch=branch, ) - else: - raise CommandError( - "Cannot publish: BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER " - "and BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO must be set in settings." - ) - - def _publish_via_raw_clone( - self, - output_dir: Path, - owner: str, - repo: str, - branch: str, - ) -> None: - """ - Publish using persistent clone at raw/boost_library_usage_dashboard/owner/repo. - Clone if missing, pull, remove contents, copy output_dir, add/commit/push. - """ - clone_dir = ( - Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo - ) - clone_dir = clone_dir.resolve() - output_dir = output_dir.resolve() - if ( - clone_dir == output_dir - or clone_dir in output_dir.parents - or output_dir in clone_dir.parents - ): - raise CommandError( - "--output-dir must not overlap with the publish clone path: " - f"{clone_dir}" - ) - clone_dir.parent.mkdir(parents=True, exist_ok=True) - token = ( - getattr(settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN", None) - or None - ) - repo_slug = f"{owner}/{repo}" - self.stdout.write( - f"Publishing dashboard artifacts to {repo_slug} ({branch})..." 
- ) - if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): - if clone_dir.exists(): - shutil.rmtree(clone_dir) - self.stdout.write(f"Cloning {repo_slug} to {clone_dir}...") - clone_repo(repo_slug, clone_dir, token=token) - self.stdout.write("Pulling latest...") - pull(clone_dir, branch=branch, token=token) - for child in clone_dir.iterdir(): - if child.name == ".git": - continue - if child.is_dir() and child.name == "develop": - shutil.rmtree(child) - publish_subdir = clone_dir / "develop" - publish_subdir.mkdir(parents=True, exist_ok=True) - for child in output_dir.iterdir(): - dest = publish_subdir / child.name - if child.is_dir(): - shutil.copytree(child, dest) - else: - if child.suffix != ".html": - continue - shutil.copy2(child, dest) - tz_name = getattr(settings, "CELERY_TIMEZONE", None) or settings.TIME_ZONE - commit_time = datetime.now(ZoneInfo(tz_name)).strftime("%Y-%m-%d %H:%M:%S") - commit_message = ( - f"Update Boost library usage dashboard artifacts ({commit_time})" - ) - push( - clone_dir, - remote="origin", - branch=branch, - commit_message=commit_message, - token=token, - ) - self.stdout.write( - self.style.SUCCESS("Dashboard artifacts published successfully.") - ) diff --git a/boost_library_usage_dashboard/publisher.py b/boost_library_usage_dashboard/publisher.py new file mode 100644 index 00000000..b09a18d6 --- /dev/null +++ b/boost_library_usage_dashboard/publisher.py @@ -0,0 +1,127 @@ +"""Publish Boost library usage dashboard artifacts to a GitHub repository.""" + +from __future__ import annotations + +import logging +import re +import shutil +from datetime import datetime, timezone +from pathlib import Path + +from django.conf import settings +from django.core.management.base import CommandError + +from github_ops.git_ops import clone_repo, prepare_repo_for_pull, pull, push + +logger = logging.getLogger(__name__) + +# GitHub owner/login and repository name: single path segment, no traversal. +_GITHUB_OWNER_REPO_SLUG = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?$") + + +def _validate_github_slug(label: str, value: str) -> str: + """Return stripped owner or repo name, or raise CommandError if unsafe or invalid.""" + v = (value or "").strip() + if not v: + raise CommandError(f"Invalid GitHub {label}: empty") + if v in (".", ".."): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if "/" in v or "\\" in v: + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if Path(v).is_absolute(): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if not _GITHUB_OWNER_REPO_SLUG.fullmatch(v): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + return v + + +def publish_dashboard( + output_dir: Path, + owner: str, + repo: str, + branch: str, +) -> None: + """ + Publish using a persistent clone at raw/boost_library_usage_dashboard//. + Clone if missing, then fetch/clean/reset the clone, pull, sync ``develop/`` from + output_dir, commit, push. + + Uses ``settings.GITHUB_TOKEN_WRITE`` for clone/pull/push and + ``settings.GIT_AUTHOR_NAME`` / ``settings.GIT_AUTHOR_EMAIL`` for the commit + identity (via env vars on ``git commit`` only). 
+ """ + owner = _validate_github_slug("owner", owner) + repo = _validate_github_slug("repo", repo) + + publish_root = (Path(settings.RAW_DIR) / "boost_library_usage_dashboard").resolve() + clone_dir = (publish_root / owner / repo).resolve() + try: + clone_dir.relative_to(publish_root) + except ValueError: + raise CommandError( + f"Publish clone path escapes dashboard publish root: {clone_dir}" + ) from None + + output_dir = output_dir.resolve() + if ( + clone_dir == output_dir + or clone_dir in output_dir.parents + or output_dir in clone_dir.parents + ): + raise CommandError( + "Workspace output directory must not overlap with the publish clone path: " + f"{clone_dir}" + ) + + clone_dir.parent.mkdir(parents=True, exist_ok=True) + token = (getattr(settings, "GITHUB_TOKEN_WRITE", None) or "").strip() or None + git_user_name = (getattr(settings, "GIT_AUTHOR_NAME", None) or "unknown").strip() + git_user_email = ( + getattr(settings, "GIT_AUTHOR_EMAIL", None) or "unknown@noreply.github.com" + ).strip() + + repo_slug = f"{owner}/{repo}" + logger.info("Publishing dashboard artifacts to %s (%s)...", repo_slug, branch) + + if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): + if clone_dir.exists(): + shutil.rmtree(clone_dir) + logger.info("Cloning %s to %s", repo_slug, clone_dir) + clone_repo(repo_slug, clone_dir, token=token) + + logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) + prepare_repo_for_pull(clone_dir, remote="origin", token=token) + + logger.info("Pulling latest for %s", clone_dir) + pull(clone_dir, branch=branch, token=token) + + for child in clone_dir.iterdir(): + if child.name == ".git": + continue + if child.is_dir() and child.name == "develop": + shutil.rmtree(child) + + publish_subdir = clone_dir / "develop" + publish_subdir.mkdir(parents=True, exist_ok=True) + + for child in output_dir.iterdir(): + dest = publish_subdir / child.name + if child.is_dir(): + shutil.copytree(child, dest) + else: + if child.suffix != ".html": + continue + shutil.copy2(child, dest) + + commit_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + commit_message = f"Update Boost library usage dashboard artifacts ({commit_time})" + push( + clone_dir, + remote="origin", + branch=branch, + commit_message=commit_message, + token=token, + git_user_name=git_user_name, + git_user_email=git_user_email, + ) + logger.info("Dashboard artifacts published successfully to %s.", repo_slug) diff --git a/boost_library_usage_dashboard/tests/fixtures.py b/boost_library_usage_dashboard/tests/fixtures.py index 29945417..66c0ac5c 100644 --- a/boost_library_usage_dashboard/tests/fixtures.py +++ b/boost_library_usage_dashboard/tests/fixtures.py @@ -5,4 +5,5 @@ @pytest.fixture def dashboard_cmd_name(): + """Name of the ``run_boost_library_usage_dashboard`` management command.""" return "run_boost_library_usage_dashboard" diff --git a/boost_library_usage_dashboard/tests/test_analyzer.py b/boost_library_usage_dashboard/tests/test_analyzer.py index f5c84618..2ab0f392 100644 --- a/boost_library_usage_dashboard/tests/test_analyzer.py +++ b/boost_library_usage_dashboard/tests/test_analyzer.py @@ -9,7 +9,6 @@ def _make_analyzer() -> BoostUsageDashboardAnalyzer: analyzer = BoostUsageDashboardAnalyzer.__new__(BoostUsageDashboardAnalyzer) - analyzer.base_dir = Path(tempfile.gettempdir()) / "boost-dashboard-test-base" analyzer.output_dir = Path(tempfile.gettempdir()) / "boost-dashboard-test-output" analyzer.version_name_list = ["1.50.0", "1.51.0", "1.52.0", "1.53.0", "1.54.0"] 
analyzer.repo_info = [] diff --git a/boost_library_usage_dashboard/tests/test_command.py b/boost_library_usage_dashboard/tests/test_command.py index a13c8b3f..7f91e11e 100644 --- a/boost_library_usage_dashboard/tests/test_command.py +++ b/boost_library_usage_dashboard/tests/test_command.py @@ -1,6 +1,5 @@ """Tests for run_boost_library_usage_dashboard command.""" -from io import StringIO from pathlib import Path from unittest.mock import MagicMock, patch @@ -12,35 +11,33 @@ @pytest.mark.django_db def test_dashboard_command_exists(dashboard_cmd_name): + """The dashboard management command is registered with Django.""" commands = get_commands() assert dashboard_cmd_name in commands @pytest.mark.django_db def test_dashboard_command_runs_generation_only(dashboard_cmd_name, tmp_path): + """Default collect+render runs; publish is skipped when ``--skip-publish`` is passed.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {"total_repositories": 0} fake_analyzer.report_file = tmp_path / "Boost_Usage_Report_total.md" fake_analyzer.stars_min_threshold = 10 - out = StringIO() - err = StringIO() - with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ) as analyzer_cls, patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.write_summary_report" ) as write_report, patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" - ) as render_html: - call_command( - dashboard_cmd_name, - "--output-dir", - str(tmp_path), - stdout=out, - stderr=err, - ) + ) as render_html, patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock: + call_command(dashboard_cmd_name, "--skip-publish") analyzer_cls.assert_called_once() fake_analyzer.run.assert_called_once() @@ -54,13 +51,14 @@ def test_dashboard_command_runs_generation_only(dashboard_cmd_name, tmp_path): base_dir=settings.BASE_DIR, output_dir=expected_output_dir, ) + publish_mock.assert_not_called() @pytest.mark.django_db -def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( +def test_dashboard_command_publish_with_owner_repo_calls_publish_dashboard( dashboard_cmd_name, tmp_path ): - """When --publish and settings have owner/repo, _publish_via_raw_clone is called.""" + """When owner/repo are set (settings or CLI), publish_dashboard is called.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "Boost_Usage_Report_total.md" @@ -68,6 +66,9 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( (tmp_path / "index.html").write_text("") with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ), patch( @@ -75,8 +76,8 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - 
"boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "myorg", @@ -91,15 +92,12 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( ): call_command( dashboard_cmd_name, - "--publish", - "--target-branch", + "--branch", "gh-pages", - "--output-dir", - str(tmp_path), ) - publish_raw_mock.assert_called_once() - call_kw = publish_raw_mock.call_args[1] + publish_mock.assert_called_once() + call_kw = publish_mock.call_args[1] assert call_kw["owner"] == "myorg" assert call_kw["repo"] == "my-repo" assert call_kw["branch"] == "gh-pages" @@ -110,13 +108,17 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( def test_dashboard_command_publish_uses_branch_from_settings_when_set( dashboard_cmd_name, tmp_path ): - """When BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH is set, it is passed to _publish_via_raw_clone.""" + """When BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH is set, it is used if --branch omitted.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "report.md" fake_analyzer.stars_min_threshold = 10 + (tmp_path / "index.html").write_text("") with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ), patch( @@ -124,8 +126,8 @@ def test_dashboard_command_publish_uses_branch_from_settings_when_set( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "org", @@ -138,29 +140,25 @@ def test_dashboard_command_publish_uses_branch_from_settings_when_set( "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", "publish-branch", ): - call_command( - dashboard_cmd_name, - "--publish", - "--target-branch", - "main", - "--output-dir", - str(tmp_path), - ) + call_command(dashboard_cmd_name) - assert publish_raw_mock.call_args[1]["branch"] == "publish-branch" + assert publish_mock.call_args[1]["branch"] == "publish-branch" @pytest.mark.django_db -def test_dashboard_command_publish_no_owner_repo_raises_command_error( +def test_dashboard_command_publish_no_owner_repo_skips_publish( dashboard_cmd_name, tmp_path ): - """When --publish but owner or repo missing in settings, CommandError is raised.""" + """When owner and repo are missing, publish is skipped (no CommandError).""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "report.md" fake_analyzer.stars_min_threshold = 10 with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( 
"boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ), patch( @@ -168,8 +166,8 @@ def test_dashboard_command_publish_no_owner_repo_raises_command_error( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "", @@ -178,12 +176,43 @@ def test_dashboard_command_publish_no_owner_repo_raises_command_error( "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", "", ): + call_command(dashboard_cmd_name) + + publish_mock.assert_not_called() + + +@pytest.mark.django_db +def test_dashboard_command_publish_refuses_without_html_artifacts( + dashboard_cmd_name, tmp_path +): + """Publish with owner/repo but no *.html under output_dir raises CommandError.""" + with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.write_summary_report" + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", + "org", + ), patch.object( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", + "repo", + ): + tmp_path.mkdir(parents=True, exist_ok=True) + (tmp_path / "dashboard_data.json").write_text("{}") with pytest.raises(CommandError) as exc_info: call_command( dashboard_cmd_name, - "--publish", - "--output-dir", - str(tmp_path), + "--skip-collect", + "--skip-render", ) - assert "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH" in str(exc_info.value) - publish_raw_mock.assert_not_called() + assert "no HTML artifacts" in str(exc_info.value) + publish_mock.assert_not_called() diff --git a/boost_library_usage_dashboard/tests/test_publisher.py b/boost_library_usage_dashboard/tests/test_publisher.py new file mode 100644 index 00000000..fa56512d --- /dev/null +++ b/boost_library_usage_dashboard/tests/test_publisher.py @@ -0,0 +1,54 @@ +"""Tests for boost_library_usage_dashboard.publisher validation.""" + +from unittest.mock import patch + +import pytest +from django.conf import settings +from django.core.management.base import CommandError + +from boost_library_usage_dashboard.publisher import publish_dashboard + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_owner_with_path_separator(tmp_path): + """Owner must be a single slug; path separators are rejected.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_dashboard( + tmp_path / "out", + owner="foo/bar", + repo="repo", + branch="main", + ) + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_dotdot_repo(tmp_path): + """Repo must not be 
path-like.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub repo"): + publish_dashboard( + tmp_path / "out", + owner="org", + repo="..", + branch="main", + ) + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_invalid_slug_chars(tmp_path): + """Spaces and other disallowed characters are rejected.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_dashboard( + tmp_path / "out", + owner="bad name", + repo="repo", + branch="main", + ) diff --git a/boost_library_usage_dashboard/utils.py b/boost_library_usage_dashboard/utils.py index e2c2fd85..8ce8167a 100644 --- a/boost_library_usage_dashboard/utils.py +++ b/boost_library_usage_dashboard/utils.py @@ -1,29 +1,19 @@ import re +from core.utils.boost_version_operations import ( + loose_version_tuple, + normalize_boost_version_string, +) + def _version_tuple(version: str) -> tuple[int, int, int]: """Parse version string (e.g. '1.84.0', 'release-2.1.9-extra') to (major, minor, patch) for sorting.""" - if not version: - return (0, 0, 0) - parts = version.strip().split(".") - out: list[int] = [] - for part in parts[:3]: - number = "".join(c for c in part if c.isdigit()) - out.append(int(number) if number else 0) - while len(out) < 3: - out.append(0) - return tuple(out[:3]) + return loose_version_tuple(version) def normalize_version_str(version_str: str) -> str | None: """Normalize a version string for comparison; returns None if invalid or pre-1.0.""" - version = (version_str or "").strip().replace("boost-", "") - version = version.replace("-", ".").replace("_", ".") - if not version or version.startswith("0."): - return None - if len(version.split(".")) == 2: - version = f"{version}.0" - return version + return normalize_boost_version_string(version_str) def format_percent(current: int, total: int) -> str: diff --git a/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py b/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py index a732eb3b..5784081c 100644 --- a/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py +++ b/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py @@ -61,7 +61,7 @@ def _run_pinecone_sync(app_type: str, namespace: str) -> None: "run_cppa_pinecone_sync", app_type=app_type, namespace=namespace, - preprocessor="boost_mailing_list_tracker.preprocesser.preprocess_mailing_list_for_pinecone", + preprocessor="boost_mailing_list_tracker.preprocessor.preprocess_mailing_list_for_pinecone", ) logger.info( "run_boost_mailing_list_tracker: pinecone sync completed (app_type=%s, namespace=%s)", diff --git a/boost_mailing_list_tracker/preprocesser.py b/boost_mailing_list_tracker/preprocessor.py similarity index 97% rename from boost_mailing_list_tracker/preprocesser.py rename to boost_mailing_list_tracker/preprocessor.py index 325ee9f5..6f7cbbf6 100644 --- a/boost_mailing_list_tracker/preprocesser.py +++ b/boost_mailing_list_tracker/preprocessor.py @@ -120,8 +120,7 @@ def preprocess_mailing_list_for_pinecone( "author": sender_name, "timestamp": safe_timestamp, "parent_id": message.parent_id or "", - # ids should reference DB row identity for sync bookkeeping. 
- "table_ids": message.pk, + "source_ids": str(message.pk), "list_name": message.list_name or "", } diff --git a/boost_mailing_list_tracker/tests/test_preprocesser.py b/boost_mailing_list_tracker/tests/test_preprocessor.py similarity index 90% rename from boost_mailing_list_tracker/tests/test_preprocesser.py rename to boost_mailing_list_tracker/tests/test_preprocessor.py index fb9c11ff..a12b6e13 100644 --- a/boost_mailing_list_tracker/tests/test_preprocesser.py +++ b/boost_mailing_list_tracker/tests/test_preprocessor.py @@ -1,17 +1,17 @@ -"""Tests for boost_mailing_list_tracker.preprocesser.""" +"""Tests for boost_mailing_list_tracker.preprocessor.""" from datetime import timedelta import pytest from django.utils import timezone -from boost_mailing_list_tracker.preprocesser import ( +from boost_mailing_list_tracker.preprocessor import ( preprocess_mailing_list_for_pinecone, ) @pytest.mark.django_db -def test_preprocesser_returns_empty_when_no_messages(): +def test_preprocessor_returns_empty_when_no_messages(): """No source rows -> empty docs and is_chunked=False.""" docs, is_chunked = preprocess_mailing_list_for_pinecone([], None) assert docs == [] @@ -19,7 +19,7 @@ def test_preprocesser_returns_empty_when_no_messages(): @pytest.mark.django_db -def test_preprocesser_first_sync_returns_all_messages( +def test_preprocessor_first_sync_returns_all_messages( mailing_list_profile, default_list_name, sample_sent_at, @@ -52,7 +52,7 @@ def test_preprocesser_first_sync_returns_all_messages( @pytest.mark.django_db -def test_preprocesser_incremental_by_created_at( +def test_preprocessor_incremental_by_created_at( mailing_list_profile, default_list_name, sample_sent_at, @@ -92,7 +92,7 @@ def test_preprocesser_incremental_by_created_at( @pytest.mark.django_db -def test_preprocesser_retries_failed_ids_even_if_old( +def test_preprocessor_retries_failed_ids_even_if_old( mailing_list_profile, default_list_name, sample_sent_at, @@ -120,11 +120,11 @@ def test_preprocesser_retries_failed_ids_even_if_old( ) assert len(docs) == 1 assert docs[0]["metadata"]["doc_id"] == "" - assert docs[0]["metadata"]["table_ids"] == retry_msg.pk + assert docs[0]["metadata"]["source_ids"] == str(retry_msg.pk) @pytest.mark.django_db -def test_preprocesser_deduplicates_overlap_between_failed_and_incremental( +def test_preprocessor_deduplicates_overlap_between_failed_and_incremental( mailing_list_profile, default_list_name, sample_sent_at, @@ -153,7 +153,7 @@ def test_preprocesser_deduplicates_overlap_between_failed_and_incremental( @pytest.mark.django_db -def test_preprocesser_document_shape_and_metadata_fields( +def test_preprocessor_document_shape_and_metadata_fields( mailing_list_profile, default_list_name, sample_sent_at, @@ -180,7 +180,7 @@ def test_preprocesser_document_shape_and_metadata_fields( assert target["content"] != "" assert "metadata" in target assert target["metadata"]["doc_id"] == "" - assert target["metadata"]["table_ids"] == msg.pk + assert target["metadata"]["source_ids"] == str(msg.pk) assert target["metadata"]["type"] == "mailing" assert target["metadata"]["thread_id"] == "thread-1" assert target["metadata"]["parent_id"] == "" @@ -189,6 +189,7 @@ def test_preprocesser_document_shape_and_metadata_fields( assert target["metadata"]["list_name"] == default_list_name assert target["metadata"]["timestamp"] == int(sample_sent_at.timestamp()) assert "ids" not in target["metadata"] + assert "source_ids" in target["metadata"] assert "msg_id" not in target["metadata"] assert "source" not in target["metadata"] assert 
"sender_id" not in target["metadata"] @@ -197,7 +198,7 @@ def test_preprocesser_document_shape_and_metadata_fields( @pytest.mark.django_db -def test_preprocesser_handles_empty_body_with_metadata_fallback_content( +def test_preprocessor_handles_empty_body_with_metadata_fallback_content( mailing_list_profile, default_list_name, sample_sent_at, diff --git a/boost_usage_tracker/boost_searcher.py b/boost_usage_tracker/boost_searcher.py index 457fa480..e9ec5602 100644 --- a/boost_usage_tracker/boost_searcher.py +++ b/boost_usage_tracker/boost_searcher.py @@ -23,6 +23,11 @@ from datetime import datetime from typing import Any, Optional +from core.utils.boost_version_operations import ( + decode_boost_version, + normalize_boost_version_string, +) + from github_ops.client import GitHubAPIClient logger = logging.getLogger(__name__) @@ -100,15 +105,6 @@ def extract_boost_includes(content: str) -> list[str]: ] -def _normalize_version(version_str: str) -> Optional[str]: - version = version_str.replace("-", ".").replace("_", ".") - if version.startswith("0."): - return None - if len(version.split(".")) == 2: - version = f"{version}.0" - return version - - def extract_boost_version_from_content( content: str, filename: str, @@ -122,28 +118,26 @@ def extract_boost_version_from_content( match = BOOST_VERSION_HPP_PATTERN.search(content) if match: ver_int = int(match.group(1)) - major = ver_int // 100_000 - minor = (ver_int // 100) % 1_000 - patch = ver_int % 100 + major, minor, patch = decode_boost_version(ver_int) return f"{major}.{minor}.{patch}" if lower in ("cmakelists.txt", "cmakelists.cmake"): for pat in CMAKE_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) if lower in ("conanfile.txt", "conanfile.py"): for pat in CONAN_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) if lower == "vcpkg.json": for pat in VCPKG_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) return None diff --git a/clang_github_tracker/__init__.py b/clang_github_tracker/__init__.py index 6337852f..10b94902 100644 --- a/clang_github_tracker/__init__.py +++ b/clang_github_tracker/__init__.py @@ -1 +1 @@ -"""Fetch GitHub activity for a configurable repo to raw JSON only (no DB).""" +"""Clang GitHub tracker: sync configured repo to raw JSON and tracker DB tables.""" diff --git a/clang_github_tracker/apps.py b/clang_github_tracker/apps.py new file mode 100644 index 00000000..2faffb35 --- /dev/null +++ b/clang_github_tracker/apps.py @@ -0,0 +1,11 @@ +"""Django app config for clang_github_tracker.""" + +from django.apps import AppConfig + + +class ClangGithubTrackerConfig(AppConfig): + """Registers the clang_github_tracker application.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "clang_github_tracker" + verbose_name = "Clang GitHub Tracker" diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py new file mode 100644 index 00000000..747ef656 --- /dev/null +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -0,0 +1,191 @@ +""" +Backfill ClangGithubIssueItem / ClangGithubCommit from raw JSON scan. 
+""" + +from __future__ import annotations + +import json +import logging +import re +from datetime import datetime + +from django.core.management.base import BaseCommand, CommandError + +from clang_github_tracker import services as clang_services +from clang_github_tracker.workspace import OWNER, REPO, get_raw_repo_dir +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, +) + +from core.utils.datetime_parsing import parse_iso_datetime as parse_datetime + +from clang_github_tracker.sync_raw import commit_date + +logger = logging.getLogger(__name__) + +_SHA40 = re.compile(r"^[0-9a-fA-F]{40}$") +_RAW_CHUNK_EVERY = 10_000 + + +class Command(BaseCommand): + """Load ``ClangGithubIssueItem`` / ``ClangGithubCommit`` from raw JSON dirs.""" + + help = ( + "Backfill clang_github_tracker DB by scanning " + "raw/github_activity_tracker///commits|issues|prs/*.json" + ) + + def handle(self, *args, **options): + """Scan raw JSON under the configured repo and upsert DB rows.""" + self._backfill_from_raw() + + def _backfill_from_raw(self) -> None: + """Scan ``commits`` / ``issues`` / ``prs`` JSON under the raw repo dir and upsert.""" + root = get_raw_repo_dir(OWNER, REPO, create=False) + if not root.is_dir(): + raise CommandError(f"Raw repo dir missing: {root}") + + commits_dir = root / "commits" + if commits_dir.is_dir(): + commit_rows: list[tuple[str, datetime | None]] = [] + c_skip = 0 + c_ins_total = c_upd_total = 0 + for c_read_n, p in enumerate(sorted(commits_dir.glob("*.json")), start=1): + try: + data = json.loads(p.read_text(encoding="utf-8")) + sha = (data.get("sha") or "").strip() + if not _SHA40.match(sha): + c_skip += 1 + continue + commit_rows.append((sha, commit_date(data))) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip commit file %s: %s", p, e) + c_skip += 1 + if c_read_n % _RAW_CHUNK_EVERY == 0: + if commit_rows: + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) + c_ins_total += ins_c + c_upd_total += upd_c + commit_rows.clear() + logger.info( + "raw commits/: read %s JSON files; cumulative " + "inserted=%s updated=%s skipped=%s", + c_read_n, + c_ins_total, + c_upd_total, + c_skip, + ) + if commit_rows: + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) + c_ins_total += ins_c + c_upd_total += upd_c + logger.info( + "raw commits/: done inserted=%s updated=%s skipped=%s", + c_ins_total, + c_upd_total, + c_skip, + ) + + issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] + i_ins_total = i_upd_total = 0 + + issues_dir = root / "issues" + if issues_dir.is_dir(): + i_skip = 0 + i_ok = 0 + for i_read_n, p in enumerate(sorted(issues_dir.glob("*.json")), start=1): + try: + data = json.loads(p.read_text(encoding="utf-8")) + flat = normalize_issue_json(data) + num = flat.get("number") + if not isinstance(num, int) or num <= 0: + i_skip += 1 + continue + issue_rows.append( + ( + num, + False, + parse_datetime(flat.get("created_at")), + parse_datetime(flat.get("updated_at")), + ) + ) + i_ok += 1 + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip issue file %s: %s", p, e) + i_skip += 1 + if i_read_n % _RAW_CHUNK_EVERY == 0: + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info( + "raw issues/: read %s JSON files; cumulative " + "issues+prs inserted=%s updated=%s", + i_read_n, + i_ins_total, + i_upd_total, + ) + 
if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info("raw issues/: parsed_ok=%s skipped=%s", i_ok, i_skip) + + prs_dir = root / "prs" + if prs_dir.is_dir(): + pr_skip = 0 + pr_ok = 0 + for pr_read_n, p in enumerate(sorted(prs_dir.glob("*.json")), start=1): + try: + data = json.loads(p.read_text(encoding="utf-8")) + flat = normalize_pr_json(data) + num = flat.get("number") + if not isinstance(num, int) or num <= 0: + pr_skip += 1 + continue + issue_rows.append( + ( + num, + True, + parse_datetime(flat.get("created_at")), + parse_datetime(flat.get("updated_at")), + ) + ) + pr_ok += 1 + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip pr file %s: %s", p, e) + pr_skip += 1 + if pr_read_n % _RAW_CHUNK_EVERY == 0: + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info( + "raw prs/: read %s JSON files; cumulative " + "issues+prs inserted=%s updated=%s", + pr_read_n, + i_ins_total, + i_upd_total, + ) + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info("raw prs/: parsed_ok=%s skipped=%s", pr_ok, pr_skip) + + logger.info( + "raw issues+prs DB total: inserted=%s updated=%s", + i_ins_total, + i_upd_total, + ) + + logger.info("raw backfill finished root=%s", root) diff --git a/clang_github_tracker/management/commands/run_clang_github_tracker.py b/clang_github_tracker/management/commands/run_clang_github_tracker.py index b612fa3b..59507e6a 100644 --- a/clang_github_tracker/management/commands/run_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/run_clang_github_tracker.py @@ -1,38 +1,28 @@ """ Management command: run_clang_github_tracker -Fetches GitHub activity for llvm/llvm-project and saves only to -raw/github_activity_tracker/llvm/llvm-project (no DB writes). - -State (last commit/issue/PR dates) is stored in workspace/clang_github_activity/state.json. -If state is missing, it is created by scanning existing raw files or with nulls then scraping. - -After sync, updated issues/PRs are exported as Markdown and pushed to the private repo -configured via CLANG_GITHUB_TRACKER_PRIVATE_REPO_* settings. +Fetches GitHub activity for llvm/llvm-project, saves raw JSON and DB rows, optionally +exports Markdown and pushes to the configured Clang markdown GitHub repo. Resume uses DB watermarks (not state.json). 
""" import logging -import os -from datetime import datetime, timezone from pathlib import Path from django.conf import settings from django.core.management import call_command from django.core.management.base import BaseCommand, CommandError +from core.utils.datetime_parsing import parse_iso_datetime from clang_github_tracker import state_manager as clang_state -from clang_github_tracker.sync_raw import sync_raw_only +from clang_github_tracker.sync_raw import sync_clang_github_activity +from clang_github_tracker.publisher import publish_clang_markdown from clang_github_tracker.workspace import OWNER, REPO, get_workspace_root -from github_ops import get_github_token, upload_folder_to_github -from operations.md_ops.github_export import ( - detect_renames_from_dirs, - write_md_files, -) + +from operations.md_ops.github_export import write_md_files logger = logging.getLogger(__name__) -DEFAULT_PRIVATE_MD_BRANCH = "master" -PINECONE_NAMESPACE_ENV_KEY = "CLANG_GITHUB_PINECONE_NAMESPACE" +DEFAULT_CLANG_REPO_BRANCH = "master" def _run_pinecone_sync( @@ -40,12 +30,13 @@ def _run_pinecone_sync( ) -> None: """Trigger run_cppa_pinecone_sync if app_type and namespace are both set.""" if not app_type: - logger.warning("Pinecone sync skipped: --pinecone-app-type is empty.") + logger.warning( + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_APP_TYPE is empty (settings/env)." + ) return if not namespace: logger.warning( - "Pinecone sync skipped: namespace is empty (set --pinecone-namespace or %s).", - PINECONE_NAMESPACE_ENV_KEY, + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_NAMESPACE is empty (settings/env)." ) return try: @@ -68,350 +59,214 @@ def _run_pinecone_sync( class Command(BaseCommand): - """Django management command: fetch GitHub activity to raw and optionally run sync.""" + """Django management command: fetch GitHub activity to raw + DB; optional MD, push, Pinecone.""" help = ( "Run Clang GitHub Tracker: fetch llvm/llvm-project activity to " - "raw/github_activity_tracker only (no DB). Uses workspace/clang_github_activity/state.json for resume." + "raw/github_activity_tracker and DB. Uses DB cursor for resume (not state.json). " + "Use --skip-* to skip steps; default runs all." ) def add_arguments(self, parser): - """Register --dry-run, --from-date, --to-date, --no-upload, --pinecone-app-type, --pinecone-namespace.""" + """Define dry-run, skip flags, and optional ``--since`` / ``--until`` window.""" parser.add_argument( "--dry-run", action="store_true", - help="Only show resolved start/end dates and state; do not fetch.", + help="No sync, export, push, or Pinecone writes; resolved windows logged at INFO.", ) parser.add_argument( - "--from-date", - type=str, - default=None, - help="Start date for sync (ISO format: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS). Default: from state or raw scan.", + "--skip-github-sync", + action="store_true", + help="Skip API fetch / sync_clang_github_activity (raw JSON + DB upserts).", ) parser.add_argument( - "--to-date", - type=str, - default=None, - help="End date for sync (ISO format). 
Default: now.", + "--skip-markdown-export", + action="store_true", + help="Skip writing .md files from this run's sync results.", ) parser.add_argument( - "--no-upload", + "--skip-remote-push", action="store_true", - help="Generate Markdown files but skip pushing to GitHub (useful for inspection).", + help="Skip push to the repo configured via CLANG_GITHUB_CONTEXT_REPO_OWNER / CLANG_GITHUB_CONTEXT_REPO_NAME.", ) parser.add_argument( - "--upload-only", + "--skip-pinecone", action="store_true", - help="Only upload existing MD files from workspace (no sync, no MD generation).", + help="Skip run_cppa_pinecone_sync for issues and PRs.", ) parser.add_argument( - "--pinecone-app-type", + "--since", + "--from-date", + "--start-time", type=str, - default=settings.CLANG_GITHUB_PINECONE_APP_TYPE, - help="App type passed to run_cppa_pinecone_sync. Default from env CLANG_GITHUB_PINECONE_APP_TYPE.", + default=None, + dest="since", + help="Sync window start: YYYY-MM-DD or ISO-8601. " + "--from-date / --start-time are aliases for --since.", ) parser.add_argument( - "--pinecone-namespace", + "--until", + "--to-date", + "--end-time", type=str, - default=settings.CLANG_GITHUB_PINECONE_NAMESPACE, - help=f"Pinecone namespace for sync. Default from env {PINECONE_NAMESPACE_ENV_KEY}.", + default=None, + dest="until", + help="Sync window end: same formats as --since. " + "--to-date / --end-time are aliases for --until.", ) def handle(self, *args, **options): - """Resolve dates from state or CLI, then run sync unless --dry-run or --upload-only.""" + """Resolve sync window, then run GitHub fetch, Markdown, push, and Pinecone as configured.""" dry_run = options["dry_run"] - no_upload = options.get("no_upload", False) - upload_only = options.get("upload_only", False) - from_date_str = (options.get("from_date") or "").strip() - to_date_str = (options.get("to_date") or "").strip() - pinecone_app_type = ( - options.get("pinecone_app_type") or "" - ).strip() or settings.CLANG_GITHUB_PINECONE_APP_TYPE - pinecone_namespace = ( - options.get("pinecone_namespace") or "" - ).strip() or settings.CLANG_GITHUB_PINECONE_NAMESPACE - - if upload_only: - self._upload_md_only(dry_run=dry_run) - return - - from_date = None - to_date = None - if from_date_str: - try: - from_date = datetime.fromisoformat(from_date_str) - except ValueError as e: - logger.warning("Invalid --from-date: %s", e) - if to_date_str: - try: - to_date = datetime.fromisoformat(to_date_str) - except ValueError as e: - logger.warning("Invalid --to-date: %s", e) + skip_github_sync = options["skip_github_sync"] + skip_markdown_export = options["skip_markdown_export"] + skip_remote_push = options["skip_remote_push"] + skip_pinecone = options["skip_pinecone"] - # Normalize to UTC for comparison - if from_date and from_date.tzinfo is None: - from_date = from_date.replace(tzinfo=timezone.utc) - elif from_date: - from_date = from_date.astimezone(timezone.utc) - if to_date and to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) - elif to_date: - to_date = to_date.astimezone(timezone.utc) - - if from_date and to_date and from_date > to_date: - raise CommandError( - "Invalid date range: from_date must be before or equal to to_date." 
- ) - - resolved = clang_state.resolve_start_end_dates(from_date, to_date) - if resolved is None: - return + try: + since = parse_iso_datetime(options.get("since")) + until = parse_iso_datetime(options.get("until")) + except ValueError as e: + raise CommandError(str(e)) from e - start_commit, start_issue, start_pr, end_date = resolved + start_commit, start_item, end_date = clang_state.resolve_start_end_dates( + since, until + ) logger.info( - "Resolved: start_commit=%r start_issue=%r start_pr=%r end=%r", + "Resolved: start_commit=%r start_item=%r end=%r", start_commit, - start_issue, - start_pr, + start_item, end_date, ) - if dry_run: - logger.info("Dry run: no fetch performed.") - return - try: - commits_saved, issue_numbers, pr_numbers = sync_raw_only( - start_commit=start_commit, - start_issue=start_issue, - start_pr=start_pr, - end_date=end_date, - ) - logger.info( - "run_clang_github_tracker: saved commits=%s issues=%s prs=%s", - commits_saved, - len(issue_numbers), - len(pr_numbers), - ) - except Exception as e: - logger.exception("run_clang_github_tracker failed: %s", e) - raise + # Dry run - if not issue_numbers and not pr_numbers: - logger.info( - "run_clang_github_tracker: no issues/PRs synced; skipping MD export." - ) + if dry_run: + if not skip_github_sync: + logger.info("dry-run: would run GitHub sync for llvm/llvm-project") + else: + logger.info("dry-run: skipping GitHub sync (--skip-github-sync)") + if not skip_markdown_export: + logger.info("dry-run: would export Markdown for issues/PRs from sync") + if not skip_remote_push: + logger.info("dry-run: would push Markdown to configured Clang repo") + if not skip_pinecone: + logger.info("dry-run: would run Pinecone upsert for issues and PRs") + logger.info("dry-run finished") return - md_output_dir = get_workspace_root() / "md_export" - md_output_dir.mkdir(parents=True, exist_ok=True) - self.stdout.write(f"Writing MD to {md_output_dir}") + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] - try: - new_files = write_md_files( - owner=OWNER, - repo=REPO, - issue_numbers=issue_numbers, - pr_numbers=pr_numbers, - output_dir=md_output_dir, - folder_prefix="", - ) - logger.info( - "run_clang_github_tracker: generated %s MD file(s).", - len(new_files), - ) + # GitHub sync - if not new_files: - logger.info( - "run_clang_github_tracker: no MD files generated; skipping upload." + if not skip_github_sync: + try: + commits_saved, issue_numbers, pr_numbers = sync_clang_github_activity( + start_commit=start_commit, + start_item=start_item, + end_date=end_date, ) - return - - if no_upload: logger.info( - "run_clang_github_tracker: --no-upload set; skipping GitHub push." 
+ "run_clang_github_tracker: sync done; commits=%s issues=%s prs=%s", + commits_saved, + len(issue_numbers), + len(pr_numbers), ) - return + except Exception as e: + logger.exception("run_clang_github_tracker sync failed: %s", e) + raise + else: + logger.info("skipping GitHub sync (--skip-github-sync)") + + # Markdown export + + md_output_dir = get_workspace_root() / "md_export" + md_output_dir.mkdir(parents=True, exist_ok=True) - private_owner = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" - ).strip() - private_repo_name = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", "" - ).strip() - private_branch = ( - getattr( - settings, - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", - DEFAULT_PRIVATE_MD_BRANCH, + new_files: dict[str, str] = {} + if not skip_markdown_export: + if issue_numbers or pr_numbers: + logger.info("writing MD to %s", md_output_dir) + new_files = write_md_files( + owner=OWNER, + repo=REPO, + issue_numbers=issue_numbers, + pr_numbers=pr_numbers, + output_dir=md_output_dir, + folder_prefix="", ) - or DEFAULT_PRIVATE_MD_BRANCH - ).strip() - if not private_owner or not private_repo_name: - logger.error( - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER / CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME " - "not configured; skipping upload." + logger.info( + "run_clang_github_tracker: generated %s MD file(s).", + len(new_files), ) - return - - token = get_github_token(use="write") - delete_paths = detect_renames_from_dirs( - private_owner, - private_repo_name, - private_branch, - new_files, - token=token, - ) - for repo_rel in delete_paths: - stale_local = md_output_dir / repo_rel - if stale_local.exists(): - stale_local.unlink() - if delete_paths: + elif skip_github_sync: + logger.info("skipped Markdown export (no sync in this run)") + else: logger.info( - "run_clang_github_tracker: %s renamed file(s) to delete.", - len(delete_paths), + "run_clang_github_tracker: no issues/PRs synced; skipping MD export." 
) + else: + logger.info("skipping Markdown export (--skip-markdown-export)") - result = upload_folder_to_github( - local_folder=md_output_dir, - owner=private_owner, - repo=private_repo_name, - commit_message="chore: update Clang issues/PRs markdown", - branch=private_branch, - delete_paths=delete_paths or None, - ) + # Remote push - if result.get("success"): - logger.info("run_clang_github_tracker: MD upload complete.") - for local_path in new_files.values(): - Path(local_path).unlink(missing_ok=True) - else: - msg = result.get("message") or "Upload failed" - logger.error("run_clang_github_tracker: MD upload failed: %s", msg) - raise CommandError(msg) - except Exception as e: - logger.exception("run_clang_github_tracker: MD export/upload failed: %s", e) - raise + if not skip_remote_push: + logger.info("push Markdown to configured GitHub repo") + self._push_markdown(md_output_dir, new_files) + else: + logger.info("skipping remote push (--skip-remote-push)") - # Phase: upsert issues and PRs to Pinecone - effective_app_type = ( - pinecone_app_type or settings.CLANG_GITHUB_PINECONE_APP_TYPE - ) - effective_namespace = ( - pinecone_namespace or settings.CLANG_GITHUB_PINECONE_NAMESPACE - ) - _run_pinecone_sync( - f"{effective_app_type}-issues", - effective_namespace, - "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", - ) - _run_pinecone_sync( - f"{effective_app_type}-prs", - effective_namespace, - "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", - ) + # Pinecone sync - def _upload_md_only(self, *, dry_run: bool = False): - """Upload existing MD files from workspace/clang_github_activity/md_export (no sync, no generation).""" - if dry_run: - logger.info( - "run_clang_github_tracker: --upload-only with --dry-run; skipping upload." - ) - return - md_output_dir = get_workspace_root() / "md_export" - if not md_output_dir.is_dir(): - self.stdout.write( - self.style.WARNING( - f"No md_export folder at {md_output_dir}; nothing to upload." + if not skip_pinecone: + app_type = (settings.CLANG_GITHUB_PINECONE_APP_TYPE or "").strip() + namespace = (settings.CLANG_GITHUB_PINECONE_NAMESPACE or "").strip() + if not app_type: + logger.warning( + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_APP_TYPE is empty (settings/env)." 
) - ) - return - - new_files = {} - for root, _dirs, files in os.walk(md_output_dir): - for name in files: - if not name.endswith(".md"): - continue - path = Path(root) / name - repo_rel = path.relative_to(md_output_dir).as_posix() - new_files[repo_rel] = str(path) - - if not new_files: - self.stdout.write( - self.style.WARNING("No .md files in md_export; nothing to upload.") - ) - return + else: + _run_pinecone_sync( + f"{app_type}-issues", + namespace, + "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", + ) + _run_pinecone_sync( + f"{app_type}-prs", + namespace, + "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", + ) + else: + logger.info("skipping Pinecone (--skip-pinecone)") - self.stdout.write(f"Writing MD to {md_output_dir}") - self.stdout.write(f"Found {len(new_files)} .md file(s) to upload.") + logger.info("run_clang_github_tracker finished successfully") - private_owner = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" + def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None: + """Publish ``md_export`` to ``CLANG_GITHUB_CONTEXT_*`` and remove local run artifacts.""" + clang_github_context_repo_owner = getattr( + settings, "CLANG_GITHUB_CONTEXT_REPO_OWNER", "" ).strip() - private_repo_name = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", "" + clang_github_context_repo_name = getattr( + settings, "CLANG_GITHUB_CONTEXT_REPO_NAME", "" ).strip() - private_branch = ( - getattr( - settings, - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", - DEFAULT_PRIVATE_MD_BRANCH, - ) - or DEFAULT_PRIVATE_MD_BRANCH - ).strip() - - if not private_owner or not private_repo_name: + clang_github_context_repo_branch = ( + getattr(settings, "CLANG_GITHUB_CONTEXT_REPO_BRANCH", "") or "" + ).strip() or DEFAULT_CLANG_REPO_BRANCH + if not clang_github_context_repo_owner or not clang_github_context_repo_name: logger.error( - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER / CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME " - "not configured." - ) - self.stdout.write( - self.style.ERROR( - "Private repo not configured; set CLANG_GITHUB_TRACKER_PRIVATE_REPO_*." - ) + "CLANG_GITHUB_CONTEXT_REPO_OWNER / CLANG_GITHUB_CONTEXT_REPO_NAME " + "not configured; skipping Markdown push." 
) return - try: - token = get_github_token(use="write") - delete_paths = detect_renames_from_dirs( - private_owner, - private_repo_name, - private_branch, - new_files, - token=token, - ) - for repo_rel in delete_paths: - stale_local = md_output_dir / repo_rel - if stale_local.exists(): - stale_local.unlink() - if delete_paths: - logger.info( - "run_clang_github_tracker: %s renamed file(s) to delete.", - len(delete_paths), - ) - - result = upload_folder_to_github( - local_folder=md_output_dir, - owner=private_owner, - repo=private_repo_name, - commit_message="chore: update Clang issues/PRs markdown", - branch=private_branch, - delete_paths=delete_paths or None, - ) - - if result.get("success"): - self.stdout.write(self.style.SUCCESS("MD upload complete.")) - logger.info("run_clang_github_tracker: MD upload complete.") - for local_path in new_files.values(): - Path(local_path).unlink(missing_ok=True) - else: - msg = result.get("message") or "Upload failed" - self.stdout.write(self.style.ERROR(f"Upload failed: {msg}")) - logger.error( - "run_clang_github_tracker: MD upload failed: %s", - msg, - ) - raise CommandError(msg) - except Exception as e: - logger.exception("run_clang_github_tracker: upload-only failed: %s", e) - raise + publish_clang_markdown( + md_output_dir, + clang_github_context_repo_owner, + clang_github_context_repo_name, + clang_github_context_repo_branch, + new_files, + ) + logger.info("run_clang_github_tracker: MD publish complete.") + for local_path in new_files.values(): + Path(local_path).unlink(missing_ok=True) diff --git a/clang_github_tracker/migrations/0001_initial.py b/clang_github_tracker/migrations/0001_initial.py new file mode 100644 index 00000000..75b38638 --- /dev/null +++ b/clang_github_tracker/migrations/0001_initial.py @@ -0,0 +1,78 @@ +# Generated manually for clang_github_tracker models + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="ClangGithubCommit", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("sha", models.CharField(max_length=40, unique=True)), + ( + "github_committed_at", + models.DateTimeField(blank=True, db_index=True, null=True), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "db_table": "clang_github_tracker_commit", + }, + ), + migrations.CreateModel( + name="ClangGithubIssueItem", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("number", models.PositiveIntegerField(unique=True)), + ("is_pull_request", models.BooleanField(default=False)), + ( + "github_created_at", + models.DateTimeField(blank=True, null=True), + ), + ( + "github_updated_at", + models.DateTimeField( + blank=True, + db_index=True, + help_text="GitHub API updated_at; drives fetch watermarks.", + null=True, + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + db_index=True, + help_text="Last DB save; drives Pinecone incrementality vs final_sync_at.", + ), + ), + ], + options={ + "db_table": "clang_github_tracker_issue_item", + }, + ), + ] diff --git a/clang_github_tracker/migrations/__init__.py b/clang_github_tracker/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/clang_github_tracker/models.py b/clang_github_tracker/models.py new file mode 100644 index 00000000..031f9fe6 --- /dev/null +++ b/clang_github_tracker/models.py @@ -0,0 +1,44 @@ +"""Database models for clang_github_tracker (no FKs to other apps).""" + +from __future__ import annotations + +from django.db import models + + +class ClangGithubIssueItem(models.Model): + """One row per GitHub issue or PR number for the configured llvm repo.""" + + number = models.PositiveIntegerField(unique=True) + is_pull_request = models.BooleanField(default=False) + github_created_at = models.DateTimeField(null=True, blank=True) + github_updated_at = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text="GitHub API updated_at; drives fetch watermarks.", + ) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField( + auto_now=True, + db_index=True, + help_text="Last DB save; drives Pinecone incrementality vs final_sync_at.", + ) + + class Meta: + """Maps to ``clang_github_tracker_issue_item``.""" + + db_table = "clang_github_tracker_issue_item" + + +class ClangGithubCommit(models.Model): + """One row per commit SHA synced for the configured llvm repo.""" + + sha = models.CharField(max_length=40, unique=True) + github_committed_at = models.DateTimeField(null=True, blank=True, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + """Maps to ``clang_github_tracker_commit``.""" + + db_table = "clang_github_tracker_commit" diff --git a/clang_github_tracker/preprocessors/__init__.py b/clang_github_tracker/preprocessors/__init__.py index e69de29b..2645d38c 100644 --- a/clang_github_tracker/preprocessors/__init__.py +++ b/clang_github_tracker/preprocessors/__init__.py @@ -0,0 +1 @@ +"""Pinecone preprocessor modules for clang_github_tracker.""" diff --git a/clang_github_tracker/preprocessors/issue_preprocessor.py b/clang_github_tracker/preprocessors/issue_preprocessor.py index 966b26ad..aaedf213 100644 --- a/clang_github_tracker/preprocessors/issue_preprocessor.py +++ b/clang_github_tracker/preprocessors/issue_preprocessor.py @@ -1,24 +1,29 @@ """ Pinecone issue preprocessor for clang_github_tracker. -Wraps github_activity_tracker.preprocessors.github_preprocess.preprocess_issues -for the llvm/llvm-project repo (configured via CLANG_GITHUB_OWNER / CLANG_GITHUB_REPO). - -Usage (via run_cppa_pinecone_sync or run_clang_github_tracker): - app_type = APP_TYPE (default: "github-clang", override with CLANG_GITHUB_PINECONE_APP_TYPE env) - namespace = NAMESPACE ("github-clang") - preprocessor = clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone +Selects candidate issue numbers from DB (updated_at vs final_sync_at) plus failed_ids retries, +then builds documents from raw JSON via github_preprocess.build_issue_document. 
""" from __future__ import annotations +import json +import logging import os +import re from datetime import datetime from typing import Any from django.conf import settings +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from github_activity_tracker.preprocessors.github_preprocess import build_issue_document +from github_activity_tracker.workspace import get_raw_source_issue_path -from github_activity_tracker.preprocessors.github_preprocess import preprocess_issues +logger = logging.getLogger(__name__) + +_ISSUE_ID_SUFFIX = re.compile(r":issue:(\d+)$") NAMESPACE = "github-clang" APP_TYPE = os.getenv("CLANG_GITHUB_PINECONE_APP_TYPE", NAMESPACE) @@ -28,18 +33,42 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, ) -> tuple[list[dict[str, Any]], bool]: - """Preprocess clang GitHub issues for Pinecone upsert. - - Args: - failed_ids: Previously failed ids strings to retry. - final_sync_at: Last successful sync timestamp; None means first run. - - Returns: - (documents, is_chunked=False) - """ - return preprocess_issues( - settings.CLANG_GITHUB_OWNER, - settings.CLANG_GITHUB_REPO, - failed_ids, - final_sync_at, - ) + """Preprocess clang GitHub issues for Pinecone upsert.""" + owner = settings.CLANG_GITHUB_OWNER + repo = settings.CLANG_GITHUB_REPO + + if final_sync_at is None: + qs = ClangGithubIssueItem.objects.filter(is_pull_request=False).values_list( + "number", flat=True + ) + else: + fs = final_sync_at + if timezone.is_naive(fs): + fs = timezone.make_aware(fs, timezone.utc) + qs = ClangGithubIssueItem.objects.filter( + is_pull_request=False, updated_at__gt=fs + ).values_list("number", flat=True) + + numbers: set[int] = set(int(n) for n in qs) + + for fid in failed_ids: + m = _ISSUE_ID_SUFFIX.search(fid or "") + if m: + numbers.add(int(m.group(1))) + + documents: list[dict[str, Any]] = [] + for number in sorted(numbers): + path = get_raw_source_issue_path(owner, repo, number) + if not path.is_file(): + logger.debug("preprocess issue #%s: raw missing %s", number, path) + continue + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("preprocess issue #%s: read failed %s", number, e) + continue + doc = build_issue_document(path, data, repo) + if doc: + documents.append(doc) + + return documents, False diff --git a/clang_github_tracker/preprocessors/pr_preprocessor.py b/clang_github_tracker/preprocessors/pr_preprocessor.py index bd9f0c9f..06f6aabe 100644 --- a/clang_github_tracker/preprocessors/pr_preprocessor.py +++ b/clang_github_tracker/preprocessors/pr_preprocessor.py @@ -1,24 +1,29 @@ """ Pinecone PR preprocessor for clang_github_tracker. -Wraps github_activity_tracker.preprocessors.github_preprocess.preprocess_prs -for the llvm/llvm-project repo (configured via CLANG_GITHUB_OWNER / CLANG_GITHUB_REPO). - -Usage (via run_cppa_pinecone_sync or run_clang_github_tracker): - app_type = APP_TYPE (default: "github-clang", override with CLANG_GITHUB_PINECONE_APP_TYPE env) - namespace = NAMESPACE ("github-clang") - preprocessor = clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone +Selects candidate PR numbers from DB (updated_at vs final_sync_at) plus failed_ids retries, +then builds documents from raw JSON via github_preprocess.build_pr_document. 
""" from __future__ import annotations +import json +import logging import os +import re from datetime import datetime from typing import Any from django.conf import settings +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from github_activity_tracker.preprocessors.github_preprocess import build_pr_document +from github_activity_tracker.workspace import get_raw_source_pr_path -from github_activity_tracker.preprocessors.github_preprocess import preprocess_prs +logger = logging.getLogger(__name__) + +_PR_ID_SUFFIX = re.compile(r":pr:(\d+)$") NAMESPACE = "github-clang" APP_TYPE = os.getenv("CLANG_GITHUB_PINECONE_APP_TYPE", NAMESPACE) @@ -28,18 +33,42 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, ) -> tuple[list[dict[str, Any]], bool]: - """Preprocess clang GitHub pull requests for Pinecone upsert. - - Args: - failed_ids: Previously failed ids strings to retry. - final_sync_at: Last successful sync timestamp; None means first run. - - Returns: - (documents, is_chunked=False) - """ - return preprocess_prs( - settings.CLANG_GITHUB_OWNER, - settings.CLANG_GITHUB_REPO, - failed_ids, - final_sync_at, - ) + """Preprocess clang GitHub pull requests for Pinecone upsert.""" + owner = settings.CLANG_GITHUB_OWNER + repo = settings.CLANG_GITHUB_REPO + + if final_sync_at is None: + qs = ClangGithubIssueItem.objects.filter(is_pull_request=True).values_list( + "number", flat=True + ) + else: + fs = final_sync_at + if timezone.is_naive(fs): + fs = timezone.make_aware(fs, timezone.utc) + qs = ClangGithubIssueItem.objects.filter( + is_pull_request=True, updated_at__gt=fs + ).values_list("number", flat=True) + + numbers: set[int] = set(int(n) for n in qs) + + for fid in failed_ids: + m = _PR_ID_SUFFIX.search(fid or "") + if m: + numbers.add(int(m.group(1))) + + documents: list[dict[str, Any]] = [] + for number in sorted(numbers): + path = get_raw_source_pr_path(owner, repo, number) + if not path.is_file(): + logger.debug("preprocess pr #%s: raw missing %s", number, path) + continue + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("preprocess pr #%s: read failed %s", number, e) + continue + doc = build_pr_document(path, data, repo) + if doc: + documents.append(doc) + + return documents, False diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py new file mode 100644 index 00000000..36b35c59 --- /dev/null +++ b/clang_github_tracker/publisher.py @@ -0,0 +1,256 @@ +"""Publish Clang markdown export to GitHub via a persistent clone.""" + +from __future__ import annotations + +import logging +import re +import shutil +import subprocess +from pathlib import Path + +from django.conf import settings +from django.core.management.base import CommandError + +from github_ops.git_ops import ( + clone_repo, + prepare_repo_for_pull, + pull, + push as git_push, + sanitize_git_output, +) +from github_ops.tokens import get_github_token +from operations.md_ops.github_export import detect_stale_titled_paths + +logger = logging.getLogger(__name__) + +_GITHUB_OWNER_REPO_SLUG = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?$") + + +def _redacted_git_subprocess_error(e: subprocess.CalledProcessError) -> str: + """Stderr/stdout or fallback ``str(e)``, redacted for logs and ``CommandError`` text.""" + tail = ((e.stderr or "") + (e.stdout or "")).strip() + text = tail if tail else str(e) + return 
sanitize_git_output(text)
+
+
+def _validate_github_slug(label: str, value: str) -> str:
+    """Return stripped owner or repo name, or raise CommandError if unsafe or invalid."""
+    v = (value or "").strip()
+    if not v:
+        raise CommandError(f"Invalid GitHub {label}: empty")
+    if v in (".", ".."):
+        raise CommandError(f"Invalid GitHub {label}: {v!r}")
+    if "/" in v or "\\" in v:
+        raise CommandError(f"Invalid GitHub {label}: {v!r}")
+    if Path(v).is_absolute():
+        raise CommandError(f"Invalid GitHub {label}: {v!r}")
+    if not _GITHUB_OWNER_REPO_SLUG.fullmatch(v):
+        raise CommandError(f"Invalid GitHub {label}: {v!r}")
+    return v
+
+
+def _reset_hard_to_upstream(clone_dir: Path, remote: str, branch: str) -> None:
+    """Match origin/<branch> after pull so unpushed local commits from a failed push are dropped."""
+    ref = f"{remote}/{branch}"
+    try:
+        subprocess.run(
+            ["git", "-C", str(clone_dir), "reset", "--hard", ref],
+            check=True,
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.CalledProcessError as e:
+        err = _redacted_git_subprocess_error(e)
+        raise CommandError(f"Could not reset clone to {ref}: {err}") from e
+
+
+def _md_repo_rel_map(md_output_dir: Path) -> dict[str, str]:
+    """Map repo-relative posix path → absolute path for each .md under md_output_dir."""
+    md_output_dir = md_output_dir.resolve()
+    out: dict[str, str] = {}
+    for path in md_output_dir.rglob("*"):
+        if not path.is_file():
+            continue
+        if ".git" in path.relative_to(md_output_dir).parts:
+            continue
+        if path.suffix.lower() != ".md":
+            continue
+        rel = path.relative_to(md_output_dir).as_posix()
+        out[rel] = str(path.resolve())
+    return out
+
+
+def _copy_md_tree(md_output_dir: Path, clone_dir: Path) -> None:
+    """Copy all files under md_output_dir into clone_dir (preserve relative paths)."""
+    md_output_dir = md_output_dir.resolve()
+    clone_dir = clone_dir.resolve()
+    for path in md_output_dir.rglob("*"):
+        if not path.is_file():
+            continue
+        if ".git" in path.relative_to(md_output_dir).parts:
+            continue
+        rel = path.relative_to(md_output_dir)
+        dest = clone_dir / rel
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(path, dest)
+
+
+def publish_clang_markdown(
+    md_output_dir: Path,
+    owner: str,
+    repo: str,
+    branch: str,
+    new_files: dict[str, str],
+) -> None:
+    """
+    Clone (if needed) at RAW_DIR/clang_github_tracker/<owner>/<repo>, fetch/clean/pull,
+    align to origin/<branch>, remove stale titled .md in md_export and clone, overlay
+    md_export into the clone, commit and push.
+
+    Stale paths under ``md_output_dir`` use ``new_files`` (this run's writes). Stale
+    paths in the clone are detected using all ``.md`` files currently on disk under
+    ``md_output_dir`` so the clone matches the export tree.
+
+    Uses get_github_token(use=\"write\") and settings GIT_AUTHOR_* for the commit.
+ """ + owner = _validate_github_slug("owner", owner) + repo = _validate_github_slug("repo", repo) + + publish_root = (Path(settings.RAW_DIR) / "clang_github_tracker").resolve() + clone_dir = (publish_root / owner / repo).resolve() + try: + clone_dir.relative_to(publish_root) + except ValueError as e: + raise CommandError( + f"Publish clone path escapes clang publish root: {clone_dir}" + ) from e + + md_output_dir = md_output_dir.resolve() + if ( + clone_dir == md_output_dir + or clone_dir in md_output_dir.parents + or md_output_dir in clone_dir.parents + ): + raise CommandError( + "Markdown output directory must not overlap with the publish clone path: " + f"{clone_dir}" + ) + + # Private CLANG_GITHUB_CONTEXT_* repos need a PAT that can read them (clone/pull) + # and push; get_github_token("write") uses GITHUB_TOKEN_WRITE or GITHUB_TOKEN. + try: + token = get_github_token(use="write") + except ValueError as e: + raise CommandError(str(e)) from e + git_user_name = ( + getattr(settings, "GIT_AUTHOR_NAME", None) or "" + ).strip() or "unknown" + git_user_email = ( + getattr(settings, "GIT_AUTHOR_EMAIL", None) or "" + ).strip() or "unknown@noreply.github.com" + + repo_slug = f"{owner}/{repo}" + logger.info("Publishing Clang markdown to %s (%s)...", repo_slug, branch) + logger.info( + "Publish git operations use the write token (GITHUB_TOKEN_WRITE, else " + "GITHUB_TOKEN). For a private target repo, that PAT must be granted access " + "to %s.", + repo_slug, + ) + + clone_dir.parent.mkdir(parents=True, exist_ok=True) + if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): + if clone_dir.exists(): + shutil.rmtree(clone_dir) + logger.info("Cloning %s to %s", repo_slug, clone_dir) + try: + clone_repo(repo_slug, clone_dir, token=token) + except subprocess.CalledProcessError as e: + msg = _redacted_git_subprocess_error(e) + hint = ( + "Clone already uses get_github_token(use='write') (GITHUB_TOKEN_WRITE " + "or GITHUB_TOKEN). Verify CLANG_GITHUB_CONTEXT_REPO_OWNER / _NAME, " + "and that this PAT can access the repo: for a private repo use a " + "classic PAT with 'repo' scope or a fine-grained PAT with access to " + "that repository. GitHub often returns 'not found' when the token " + "lacks access." + ) + logger.error("clang_github_tracker publish: git clone failed: %s", msg) + raise CommandError( + f"Git clone failed for {repo_slug}: {msg}. 
{hint}" + ) from e + + logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) + try: + prepare_repo_for_pull(clone_dir, remote="origin", token=token) + except subprocess.CalledProcessError as e: + err = _redacted_git_subprocess_error(e) + logger.error( + "clang_github_tracker publish: prepare clone for pull failed " + "(clone_dir=%s, branch=%s): %s", + clone_dir, + branch, + err, + exc_info=e, + ) + raise CommandError(f"Failed to prepare clone for pull: {err}") from e + + logger.info("Pulling latest for %s", clone_dir) + try: + pull(clone_dir, branch=branch, token=token) + except subprocess.CalledProcessError as e: + err = _redacted_git_subprocess_error(e) + logger.error( + "clang_github_tracker publish: git pull failed (clone_dir=%s, branch=%s): %s", + clone_dir, + branch, + err, + exc_info=e, + ) + raise CommandError(f"Git pull failed: {err}") from e + + logger.info("Resetting clone to origin/%s (discard unpushed commits)", branch) + _reset_hard_to_upstream(clone_dir, "origin", branch) + + stale_md = detect_stale_titled_paths(md_output_dir, new_files) + + for rel in stale_md: + p = md_output_dir / rel + if p.is_file(): + p.unlink() + + md_repo_rel_map = _md_repo_rel_map(md_output_dir) + stale_clone = detect_stale_titled_paths(clone_dir, md_repo_rel_map) + + for rel in stale_clone: + p = clone_dir / rel + if p.is_file(): + p.unlink() + + all_stale = sorted(set(stale_md) | set(stale_clone)) + if all_stale: + logger.info( + "clang_github_tracker publish: removed %s stale titled file(s).", + len(all_stale), + ) + + _copy_md_tree(md_output_dir, clone_dir) + + try: + git_push( + clone_dir, + remote="origin", + branch=branch, + commit_message="chore: update Clang issues/PRs markdown", + token=token, + git_user_name=git_user_name, + git_user_email=git_user_email, + ) + except subprocess.CalledProcessError as e: + err = _redacted_git_subprocess_error(e) + logger.error("clang_github_tracker publish: git push failed: %s", err) + raise CommandError(f"Git push failed: {err}") from e + + logger.info("Clang markdown published successfully to %s.", repo_slug) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py new file mode 100644 index 00000000..9cee1a05 --- /dev/null +++ b/clang_github_tracker/services.py @@ -0,0 +1,300 @@ +"""DB upsert and watermark helpers for clang_github_tracker.""" + +from __future__ import annotations + +import logging +from collections.abc import Sequence +from datetime import datetime, timedelta +from typing import Optional + +from django.db.models import Max +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem +from core.utils.datetime_parsing import ensure_aware_utc + +logger = logging.getLogger(__name__) + +DEFAULT_UPSERT_BATCH_SIZE = 500 + + +def _invalid_issue_number(n: object) -> bool: + """True if ``n`` is not a positive ``int`` (rejects ``bool`` — it subclasses ``int``).""" + return isinstance(n, bool) or not isinstance(n, int) or n <= 0 + + +def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | None: + """Return the later of two datetimes; ``None`` is treated as missing (never wins over a value).""" + if current is None: + return incoming + if incoming is None: + return current + return max(current, incoming) + + +def _merge_issue_item_fields( + existing: ClangGithubIssueItem | None, + is_pull_request: bool, + github_created_at: datetime | None, + github_updated_at: datetime | None, +) -> tuple[bool, datetime | None, 
datetime | None]: + """Merge incoming issue/PR fields with a stored row (None / older incoming must not weaken state).""" + if existing is None: + return (is_pull_request, github_created_at, github_updated_at) + return ( + existing.is_pull_request or is_pull_request, + ( + github_created_at + if github_created_at is not None + else existing.github_created_at + ), + _max_dt(existing.github_updated_at, github_updated_at), + ) + + +def upsert_issue_item( + number: int, + *, + is_pull_request: bool, + github_created_at: datetime | None, + github_updated_at: datetime | None, +) -> tuple[ClangGithubIssueItem, bool]: + """Create or update a ClangGithubIssueItem by ``number``. Returns (instance, created).""" + if _invalid_issue_number(number): + raise ValueError(f"issue number must be a positive integer, got {number!r}") + github_created_at = ensure_aware_utc(github_created_at) + github_updated_at = ensure_aware_utc(github_updated_at) + existing = ClangGithubIssueItem.objects.filter(number=number).first() + is_pr, gc, gu = _merge_issue_item_fields( + existing, + is_pull_request, + github_created_at, + github_updated_at, + ) + obj, created = ClangGithubIssueItem.objects.update_or_create( + number=number, + defaults={ + "is_pull_request": is_pr, + "github_created_at": gc, + "github_updated_at": gu, + }, + ) + logger.debug( + "clang issue item #%s %s (pr=%s)", + number, + "created" if created else "updated", + is_pull_request, + ) + return obj, created + + +def upsert_commit( + sha: str, + *, + github_committed_at: datetime | None, +) -> tuple[ClangGithubCommit, bool]: + """Create or update a ClangGithubCommit by ``sha``. Returns (instance, created).""" + sha_clean = (sha or "").strip().lower() + if len(sha_clean) != 40: + raise ValueError(f"commit sha must be 40 hex chars, got {sha_clean!r}") + github_committed_at = ensure_aware_utc(github_committed_at) + existing = ClangGithubCommit.objects.filter(sha=sha_clean).first() + merged_committed_at = _max_dt( + existing.github_committed_at if existing else None, + github_committed_at, + ) + obj, created = ClangGithubCommit.objects.update_or_create( + sha=sha_clean, + defaults={"github_committed_at": merged_committed_at}, + ) + logger.debug( + "clang commit %s %s", + sha_clean[:8], + "created" if created else "updated", + ) + return obj, created + + +def _flush_commits_chunk( + pairs: list[tuple[str, datetime | None]], +) -> tuple[int, int]: + """Write one chunk; returns (inserted_count, updated_count).""" + if not pairs: + return 0, 0 + shas = [s for s, _ in pairs] + existing_committed = { + row.sha: row.github_committed_at + for row in ClangGithubCommit.objects.filter(sha__in=shas).only( + "sha", "github_committed_at" + ) + } + existing = set(existing_committed.keys()) + now = timezone.now() + objs = [ + ClangGithubCommit( + sha=s, + github_committed_at=_max_dt( + ensure_aware_utc(existing_committed.get(s)), + ensure_aware_utc(dt), + ), + updated_at=now, + ) + for s, dt in pairs + ] + ClangGithubCommit.objects.bulk_create( + objs, + batch_size=len(objs), + update_conflicts=True, + unique_fields=["sha"], + update_fields=["github_committed_at", "updated_at"], + ) + inserted = sum(1 for s, _ in pairs if s not in existing) + updated = len(pairs) - inserted + return inserted, updated + + +def upsert_commits_batch( + rows: Sequence[tuple[str, datetime | None]], + *, + batch_size: int = DEFAULT_UPSERT_BATCH_SIZE, +) -> tuple[int, int]: + """Batch upsert commits by ``sha``. Skips rows whose sha is not 40 chars. 
+
+    Returns:
+        (inserted, updated) counts across all batches.
+    """
+    if batch_size <= 0:
+        logger.warning(
+            "batch_size must be positive, using %s", DEFAULT_UPSERT_BATCH_SIZE
+        )
+        batch_size = DEFAULT_UPSERT_BATCH_SIZE
+    # Clamp only when rows is non-empty; clamping to len(rows) == 0 would make the
+    # chunking range step zero below.
+    if rows and batch_size > len(rows):
+        logger.warning(
+            "batch_size is greater than the number of rows, using %s",
+            len(rows),
+        )
+        batch_size = len(rows)
+    merged: dict[str, datetime | None] = {}
+    for sha, dt in rows:
+        s = (sha or "").strip().lower()
+        if len(s) != 40:
+            continue
+        dt_a = ensure_aware_utc(dt)
+        merged[s] = _max_dt(merged.get(s), dt_a)
+    inserted = updated = 0
+    items = list(merged.items())
+    for i in range(0, len(items), batch_size):
+        di, du = _flush_commits_chunk(items[i : i + batch_size])
+        inserted += di
+        updated += du
+    return inserted, updated
+
+
+def _flush_issue_items_chunk(
+    rows: list[tuple[int, bool, datetime | None, datetime | None]],
+) -> tuple[int, int]:
+    """Bulk upsert one chunk of issue/PR rows; returns (inserted, updated)."""
+    if not rows:
+        return 0, 0
+    nums = [n for n, _, _, _ in rows]
+    existing_by_num = {
+        obj.number: obj
+        for obj in ClangGithubIssueItem.objects.filter(number__in=nums).only(
+            "number",
+            "is_pull_request",
+            "github_created_at",
+            "github_updated_at",
+        )
+    }
+    existing = set(existing_by_num.keys())
+    now = timezone.now()
+    objs = []
+    for n, is_pr, gc, gu in rows:
+        gc = ensure_aware_utc(gc)
+        gu = ensure_aware_utc(gu)
+        m_is_pr, m_gc, m_gu = _merge_issue_item_fields(
+            existing_by_num.get(n), is_pr, gc, gu
+        )
+        objs.append(
+            ClangGithubIssueItem(
+                number=n,
+                is_pull_request=m_is_pr,
+                github_created_at=m_gc,
+                github_updated_at=m_gu,
+                updated_at=now,
+            )
+        )
+    ClangGithubIssueItem.objects.bulk_create(
+        objs,
+        batch_size=len(objs),
+        update_conflicts=True,
+        unique_fields=["number"],
+        update_fields=[
+            "is_pull_request",
+            "github_created_at",
+            "github_updated_at",
+            "updated_at",
+        ],
+    )
+    inserted = sum(1 for n, _, _, _ in rows if n not in existing)
+    updated = len(rows) - inserted
+    return inserted, updated
+
+
+def upsert_issue_items_batch(
+    rows: Sequence[tuple[int, bool, datetime | None, datetime | None]],
+    *,
+    batch_size: int = DEFAULT_UPSERT_BATCH_SIZE,
+) -> tuple[int, int]:
+    """Batch upsert issue/PR rows by ``number``.
+
+    Duplicate ``number`` values merge: ``github_updated_at`` uses the latest
+    timestamp; ``github_created_at`` uses a later row's value when non-None,
+    otherwise keeps the prior value; ``is_pull_request`` is True if any row
+    marks the number as a PR.
+
+    Returns:
+        (inserted, updated) counts across all batches.
+ """ + merged: dict[int, tuple[bool, datetime | None, datetime | None]] = {} + for num, is_pr, gc, gu in rows: + if _invalid_issue_number(num): + continue + gc = ensure_aware_utc(gc) + gu = ensure_aware_utc(gu) + prev = merged.get(num) + if prev is None: + merged[num] = (is_pr, gc, gu) + else: + prev_is_pr, prev_gc, prev_gu = prev + merged[num] = ( + prev_is_pr or is_pr, + gc if gc is not None else prev_gc, + _max_dt(prev_gu, gu), + ) + inserted = updated = 0 + items = [(n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items())] + for i in range(0, len(items), batch_size): + di, du = _flush_issue_items_chunk(items[i : i + batch_size]) + inserted += di + updated += du + return inserted, updated + + +def get_issue_item_watermark() -> Optional[datetime]: + """Max ``github_updated_at`` across issues and PRs (API fetch cursor base).""" + m = ClangGithubIssueItem.objects.aggregate(m=Max("github_updated_at"))["m"] + return m + + +def get_commit_watermark() -> Optional[datetime]: + """Max ``github_committed_at`` across commits (API fetch cursor base).""" + m = ClangGithubCommit.objects.aggregate(m=Max("github_committed_at"))["m"] + return m + + +def start_after_watermark(max_dt: datetime | None) -> datetime | None: + """Return ``max + 1ms`` for API fetch lower bound, or ``None`` if no watermark.""" + if max_dt is None: + return None + return max_dt + timedelta(milliseconds=1) diff --git a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index f84675d1..36e8ddf7 100644 --- a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -1,294 +1,86 @@ """ -State for clang_github_tracker: last sync dates per entity (commits, issues, PRs). +Date resolution for clang_github_tracker sync windows. -Stored in workspace/clang_github_activity/state.json. -When state file is missing, it can be computed by scanning raw/github_activity_tracker/llvm/llvm-project. +Uses DB watermarks on ClangGithubIssueItem / ClangGithubCommit (not state.json). 
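+
+For example, if the newest stored ``github_updated_at`` is 2024-05-01T12:00:00Z, the
+next issues/PRs fetch starts at 2024-05-01T12:00:00.001Z (watermark + 1 ms); commits
+use ``github_committed_at`` the same way. The timestamps here are only illustrative.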
""" from __future__ import annotations -import json import logging -from datetime import datetime, timezone, timedelta -from pathlib import Path +from datetime import datetime -from clang_github_tracker.workspace import get_state_path, get_raw_repo_dir - -logger = logging.getLogger(__name__) - -# Keys in state JSON -KEY_LAST_COMMIT_DATE = "last_commit_date" -KEY_LAST_ISSUE_DATE = "last_issue_date" -KEY_LAST_PR_DATE = "last_pr_date" +from django.utils import timezone +from clang_github_tracker.services import ( + get_commit_watermark, + get_issue_item_watermark, + start_after_watermark, +) -def parse_iso(s: str | None) -> datetime | None: - """Parse ISO datetime string; returns None if missing or invalid.""" - if not s or not isinstance(s, str) or not s.strip(): - return None - try: - return datetime.fromisoformat(s.strip().replace("Z", "+00:00")) - except (ValueError, TypeError): - return None +logger = logging.getLogger(__name__) -def _to_iso(dt: datetime | None) -> str | None: - """Return datetime as ISO string with Z suffix, or None.""" +def _aware_utc(dt: datetime | None) -> datetime | None: + """Normalize ``dt`` to timezone-aware UTC, or return ``None``.""" if dt is None: return None - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return dt.isoformat().replace("+00:00", "Z") - + if timezone.is_naive(dt): + return timezone.make_aware(dt, timezone.utc) + return dt.astimezone(timezone.utc) -def load_state() -> dict[str, str | None]: - """ - Load state from workspace/clang_github_activity/state.json. - Returns: - - {} (empty dict): file missing, invalid, read error, or loaded object is empty/all-None - → invalid; ensure_state_file_exists will recompute from raw. - - Dict with keys last_commit_date, last_issue_date, last_pr_date (values str or None): - valid state file (at least one non-None date); None means no previous sync for that entity. - """ - path = get_state_path() - if not path.exists(): - return {} - try: - data = json.loads(path.read_text(encoding="utf-8")) - if not isinstance(data, dict): - return {} - # Treat empty or all-None as invalid so ensure_state_file_exists can recompute from raw - if not any( - ( - data.get(KEY_LAST_COMMIT_DATE), - data.get(KEY_LAST_ISSUE_DATE), - data.get(KEY_LAST_PR_DATE), - ) - ): - return {} - return { - KEY_LAST_COMMIT_DATE: data.get(KEY_LAST_COMMIT_DATE), - KEY_LAST_ISSUE_DATE: data.get(KEY_LAST_ISSUE_DATE), - KEY_LAST_PR_DATE: data.get(KEY_LAST_PR_DATE), - } - except Exception as e: - logger.warning("Failed to load state from %s: %s", path, e) - return {} - - -def save_state( - last_commit_date: datetime | None = None, - last_issue_date: datetime | None = None, - last_pr_date: datetime | None = None, - *, - merge: bool = True, -) -> None: - """Write state to workspace/clang_github_activity/state.json. 
If merge=True, load existing and update only provided keys.""" - path = get_state_path() - path.parent.mkdir(parents=True, exist_ok=True) - if merge: - current = load_state() - if last_commit_date is not None: - current[KEY_LAST_COMMIT_DATE] = _to_iso(last_commit_date) - if last_issue_date is not None: - current[KEY_LAST_ISSUE_DATE] = _to_iso(last_issue_date) - if last_pr_date is not None: - current[KEY_LAST_PR_DATE] = _to_iso(last_pr_date) - data = current - else: - data = { - KEY_LAST_COMMIT_DATE: _to_iso(last_commit_date), - KEY_LAST_ISSUE_DATE: _to_iso(last_issue_date), - KEY_LAST_PR_DATE: _to_iso(last_pr_date), - } - path.write_text(json.dumps(data, indent=2), encoding="utf-8") - logger.debug("Saved state to %s", path) - - -def _latest_date_from_commit_json(path: Path) -> datetime | None: - """Read a commit JSON file and return the author/committer date, or None.""" - try: - data = json.loads(path.read_text(encoding="utf-8")) - commit = data.get("commit") or {} - author = commit.get("author") or commit.get("committer") or {} - date_str = author.get("date") - return parse_iso(date_str) - except Exception: - return None - - -def _latest_date_from_issue_or_pr_json(path: Path) -> datetime | None: - """Read an issue or PR JSON file and return updated_at or created_at, or None.""" - try: - data = json.loads(path.read_text(encoding="utf-8")) - # Top-level or nested under issue_info / pr_info - for obj in [data, data.get("issue_info"), data.get("pr_info")]: - if not isinstance(obj, dict): - continue - date_str = obj.get("updated_at") or obj.get("created_at") - dt = parse_iso(date_str) - if dt is not None: - return dt - return None - except Exception: - return None - - -def compute_state_from_raw() -> dict[str, str | None]: - """ - Scan raw/github_activity_tracker// for commits, issues, prs - and return state dict with last_commit_date, last_issue_date, last_pr_date (ISO or None). - If the raw folder does not exist, returns all Nones (caller can write state.json from this). +def resolve_start_end_dates( + since: datetime | None, + until: datetime | None, +) -> tuple[datetime | None, datetime | None, datetime | None]: """ - root = get_raw_repo_dir(create=False) - result: dict[str, str | None] = { - KEY_LAST_COMMIT_DATE: None, - KEY_LAST_ISSUE_DATE: None, - KEY_LAST_PR_DATE: None, - } - if not root.is_dir(): - return result + Build GitHub sync window: ``(start_commit, start_item, end_date)`` in UTC. - # Commits - commits_dir = root / "commits" - if commits_dir.is_dir(): - latest_commit: datetime | None = None - for p in commits_dir.glob("*.json"): - dt = _latest_date_from_commit_json(p) - if dt and (latest_commit is None or dt > latest_commit): - latest_commit = dt - result[KEY_LAST_COMMIT_DATE] = _to_iso(latest_commit) + ``start_item`` is the single lower bound for the unified issues+PRs ``/issues`` fetch; + ``start_commit`` is the lower bound for the commits stream. Missing bounds mean + “from beginning” for starts. Naive datetimes are treated as UTC. 
- # Issues - issues_dir = root / "issues" - if issues_dir.is_dir(): - latest_issue: datetime | None = None - for p in issues_dir.glob("*.json"): - dt = _latest_date_from_issue_or_pr_json(p) - if dt and (latest_issue is None or dt > latest_issue): - latest_issue = dt - result[KEY_LAST_ISSUE_DATE] = _to_iso(latest_issue) + **Closed window** — both ``since`` and ``until`` are set: - # PRs - prs_dir = root / "prs" - if prs_dir.is_dir(): - latest_pr: datetime | None = None - for p in prs_dir.glob("*.json"): - dt = _latest_date_from_issue_or_pr_json(p) - if dt and (latest_pr is None or dt > latest_pr): - latest_pr = dt - result[KEY_LAST_PR_DATE] = _to_iso(latest_pr) + - If ``since <= until``: return ``(since, since, until)`` (same lower bound for both + streams; explicit end). + - If ``since > until``: log a warning, discard both CLI bounds, then use the + **DB watermark** path below. ``end_date`` is ``None``. - return result + **Otherwise** (no ``since``, or only one side after the rules above): + - ``end_date`` is ``until`` when ``until`` was provided, else ``None``. A ``None`` + end means “through now” for callers; ``sync_clang_github_activity`` substitutes + ``timezone.now()`` before fetching. -def ensure_state_file_exists() -> dict[str, str | None]: + - **Starts:** If ``since`` is set (without a valid closed window): ``start_commit`` + and ``start_item`` are both ``since``. If ``since`` is not set: both are + ``Max(github_* timestamp) + 1 millisecond`` from the DB when a watermark exists, else + ``None`` (full history). Watermarks use ``Max(github_committed_at)`` and + ``Max(github_updated_at)`` on ``ClangGithubCommit`` / ``ClangGithubIssueItem``. """ - If state file does not exist, ensure state.json exists: - - If raw folder exists: compute state from raw and write state.json. - - If raw folder does not exist: write state.json with {last_commit_date: null, last_issue_date: null, last_pr_date: null}. - If state file exists, load and return. If the file content is not a valid object (empty or invalid JSON), retry once by recomputing from raw and overwriting. - - Returns: - - {} (empty dict): error (e.g. write failed after retry). Caller should log and finish. - - Dict with last_commit_date, last_issue_date, last_pr_date (str or None): state exists; use it (None = fetch from beginning). - """ - path = get_state_path() - if path.exists(): - state = load_state() - if state: - return state - # File exists but content is not a valid object (empty or invalid); retry once by recomputing from raw - logger.warning( - "state.json is empty or not a valid object; recomputing from raw once." 
- ) - computed = compute_state_from_raw() - try: - path.write_text(json.dumps(computed, indent=2), encoding="utf-8") - logger.info( - "Rewrote state file from raw: last_commit=%s last_issue=%s last_pr=%s", - computed.get(KEY_LAST_COMMIT_DATE), - computed.get(KEY_LAST_ISSUE_DATE), - computed.get(KEY_LAST_PR_DATE), + since_aware = _aware_utc(since) + until_aware = _aware_utc(until) + + if since_aware is not None and until_aware is not None: + if since_aware > until_aware: + logger.warning( + "invalid date range: since (%s) is after until (%s); " + "using DB cursors; end_date None (sync applies now if needed)", + since_aware, + until_aware, ) - return computed - except OSError as e: - logger.warning("Failed to rewrite state file %s: %s", path, e) - return {} - computed = compute_state_from_raw() - try: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(computed, indent=2), encoding="utf-8") - except OSError as e: - logger.warning( - "Failed to write state file %s: %s; proceeding with no state.", - path, - e, - ) - return {} - logger.info( - "Created state file from raw scan: last_commit=%s last_issue=%s last_pr=%s", - computed.get(KEY_LAST_COMMIT_DATE), - computed.get(KEY_LAST_ISSUE_DATE), - computed.get(KEY_LAST_PR_DATE), - ) - return computed - - -def resolve_start_end_dates( - from_date: datetime | None, - to_date: datetime | None, -) -> tuple[datetime | None, datetime | None, datetime | None, datetime] | None: - """ - Resolve start dates for commits, issues, PRs and end_date. + since_aware, until_aware = None, None + else: + return since_aware, since_aware, until_aware - - If from_date and to_date are both provided (CLI): use them for all three and for end. - - Else: ensure state file exists (create from raw scan if missing), then use state's - last_*_date + 1s as start per entity, and to_date or now as end. + end_date = until_aware - Returns: - (start_commit, start_issue, start_pr, end_date) when state is valid. - None when state is {} after one retry — error is logged; caller should finish. - """ - if from_date is not None and to_date is not None: - # CLI provided both: use for all - if from_date.tzinfo is None: - from_date = from_date.replace(tzinfo=timezone.utc) - if to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) - return from_date, from_date, from_date, to_date - - state = ensure_state_file_exists() - if not state: - logger.error( - "State unavailable (error reading state.json or raw folder). Cannot resolve dates; exiting." 
- ) - return None - now = datetime.now(timezone.utc) - - if to_date is None: - to_date = now - elif to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) - - def start_from_state(key: str) -> datetime | None: - s = state.get(key) - dt = parse_iso(s) if isinstance(s, str) else None - if dt is None: - return None - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return dt + timedelta(seconds=1) - - start_commit = ( - from_date if from_date is not None else start_from_state(KEY_LAST_COMMIT_DATE) - ) - start_issue = ( - from_date if from_date is not None else start_from_state(KEY_LAST_ISSUE_DATE) - ) - start_pr = ( - from_date if from_date is not None else start_from_state(KEY_LAST_PR_DATE) - ) + if since_aware is None: + item_wm = start_after_watermark(get_issue_item_watermark()) + commit_wm = start_after_watermark(get_commit_watermark()) + else: + item_wm = since_aware + commit_wm = since_aware - return start_commit, start_issue, start_pr, to_date + return commit_wm, item_wm, end_date diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 8a382806..f6df55ae 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -1,13 +1,18 @@ """ -Sync llvm/llvm-project to raw/github_activity_tracker only (no DB). +Sync llvm/llvm-project to raw/github_activity_tracker and clang_github_tracker DB. -Uses github_activity_tracker.fetcher and raw_source; does not call services or persist to DB. +Staging: JSON is written under workspace/github_activity_tracker/// +(commits|issues|prs). After a successful DB upsert and raw write, the staging file is +removed. On any processing error the staging file is left for the next run. +Pending staging files are processed before any API fetch. 
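+
+Illustrative flow for one commit (the exact file names come from the
+github_activity_tracker.workspace path helpers; the ones below are only examples):
+  1. stage    workspace/github_activity_tracker/<owner>/<repo>/commits/<sha>.json
+  2. upsert   the ClangGithubCommit row for <sha>
+  3. write    the raw copy under workspace/raw/github_activity_tracker/<owner>/<repo>/
+  4. unlink   the staging file (it is kept instead if step 2 or 3 fails)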
""" from __future__ import annotations +import json import logging from datetime import datetime, timezone +from pathlib import Path from typing import Optional from github_activity_tracker import fetcher @@ -16,10 +21,24 @@ save_issue_raw_source, save_pr_raw_source, ) +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, +) +from github_activity_tracker.workspace import ( + get_commit_json_path, + get_issue_json_path, + get_pr_json_path, + iter_existing_commit_jsons, + iter_existing_issue_jsons, + iter_existing_pr_jsons, +) + +from core.utils.datetime_parsing import parse_iso_datetime as parse_datetime from github_ops import get_github_client from github_ops.client import ConnectionException, RateLimitException -from clang_github_tracker import state_manager as clang_state +from clang_github_tracker import services as clang_services from clang_github_tracker.workspace import OWNER, REPO logger = logging.getLogger(__name__) @@ -34,123 +53,279 @@ def _ensure_utc(dt: datetime | None) -> datetime | None: return dt.astimezone(timezone.utc) -def _commit_date(commit_data: dict) -> datetime | None: +def _valid_positive_issue_number(n: object) -> bool: + """True for a positive issue/PR number; rejects ``bool`` (``type(n) is int``).""" + return type(n) is int and n > 0 + + +def commit_date(commit_data: dict) -> datetime | None: """Extract author/committer date from GitHub commit payload.""" commit = commit_data.get("commit") or {} author = commit.get("author") or commit.get("committer") or {} - date_str = author.get("date") + date_str = author.get("date") or "" if not date_str: return None - return clang_state.parse_iso(date_str) + return parse_datetime(date_str) -def _issue_date(issue_data: dict) -> datetime | None: - """Extract updated_at or created_at from GitHub issue payload. - Fetcher yields {issue_info: , comments: [...]}, so check nested first.""" - info = issue_data.get("issue_info") or issue_data - date_str = info.get("updated_at") or info.get("created_at") - if not date_str: - return None - return clang_state.parse_iso(date_str) +def _write_staging_json(path: Path, data: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8") -def _pr_date(pr_data: dict) -> datetime | None: - """Extract updated_at or created_at from GitHub PR payload. - Fetcher yields {pr_info: , comments: [...], reviews: [...]}, so check nested first. +def _promote_commit_staging( + owner: str, repo: str, staging_path: Path, commit_data: dict +) -> bool: """ - info = pr_data.get("pr_info") or pr_data - date_str = info.get("updated_at") or info.get("created_at") - if not date_str: - return None - return clang_state.parse_iso(date_str) + Upsert commit to DB, write raw JSON, remove staging file. + + Returns True if fully successful. On failure the staging file is kept (except + when the payload cannot be processed — invalid sha — staging is removed). 
+ """ + sha = commit_data.get("sha") + if not isinstance(sha, str) or not sha.strip(): + logger.warning( + "clang sync: drop staging commit (missing sha): %s", staging_path + ) + staging_path.unlink(missing_ok=True) + return False + committed_at = commit_date(commit_data) + try: + clang_services.upsert_commit( + str(sha).strip(), + github_committed_at=committed_at, + ) + except Exception as e: + logger.warning( + "clang sync: commit DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_commit_raw_source(owner, repo, commit_data) + except Exception: + logger.exception( + "clang sync: raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def _promote_issue_staging( + owner: str, repo: str, staging_path: Path, item: dict +) -> bool: + flat = normalize_issue_json(item) + num = flat.get("number") + if not _valid_positive_issue_number(num): + logger.warning( + "clang sync: drop staging issue (invalid number): %s", staging_path + ) + staging_path.unlink(missing_ok=True) + return False + try: + clang_services.upsert_issue_item( + num, + is_pull_request=False, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + except Exception as e: + logger.warning( + "clang sync: issue DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_issue_raw_source(owner, repo, item) + except Exception: + logger.exception( + "clang sync: issue raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def _promote_pr_staging(owner: str, repo: str, staging_path: Path, item: dict) -> bool: + flat = normalize_pr_json(item) + num = flat.get("number") + if not _valid_positive_issue_number(num): + logger.warning("clang sync: drop staging PR (invalid number): %s", staging_path) + staging_path.unlink(missing_ok=True) + return False + try: + clang_services.upsert_issue_item( + num, + is_pull_request=True, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + except Exception as e: + logger.warning( + "clang sync: PR DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_pr_raw_source(owner, repo, item) + except Exception: + logger.exception( + "clang sync: PR raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def process_pending_clang_staging( + owner: str, + repo: str, +) -> tuple[int, list[int], list[int]]: + """ + Process workspace/github_activity_tracker/// commits, issues, prs. + + Returns (commits_promoted, issue_numbers, pr_numbers) for successful promotions. 
+ """ + commits_promoted = 0 + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] + for path in sorted(iter_existing_commit_jsons(owner, repo), key=lambda p: p.name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging commit %s", path) + continue + if not isinstance(data, dict): + continue + if _promote_commit_staging(owner, repo, path, data): + commits_promoted += 1 -def sync_raw_only( + for path in sorted(iter_existing_issue_jsons(owner, repo), key=lambda p: p.name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging issue %s", path) + continue + if not isinstance(data, dict): + continue + flat = normalize_issue_json(data) + num = flat.get("number") + if _promote_issue_staging( + owner, repo, path, data + ) and _valid_positive_issue_number(num): + issue_numbers.append(num) + + for path in sorted(iter_existing_pr_jsons(owner, repo), key=lambda p: p.name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging PR %s", path) + continue + if not isinstance(data, dict): + continue + flat = normalize_pr_json(data) + num = flat.get("number") + if _promote_pr_staging( + owner, repo, path, data + ) and _valid_positive_issue_number(num): + pr_numbers.append(num) + + return commits_promoted, issue_numbers, pr_numbers + + +def sync_clang_github_activity( start_commit: datetime | None = None, - start_issue: datetime | None = None, - start_pr: datetime | None = None, + start_item: datetime | None = None, end_date: Optional[datetime] = None, ) -> tuple[int, list[int], list[int]]: """ - Fetch llvm/llvm-project commits, issues, PRs from GitHub and save only to - raw/github_activity_tracker/llvm/llvm-project. No DB writes. + Fetch llvm/llvm-project commits, issues, PRs from GitHub and upsert DB rows. + + Staging JSON lives under ``workspace/github_activity_tracker///``; + after a successful DB upsert and raw write under + ``workspace/raw/github_activity_tracker/...``, staging files are removed. + Pending staging files are processed before any API fetch. Args: start_commit: Start date for commits (None = from beginning). - start_issue: Start date for issues (None = from beginning). - start_pr: Start date for PRs (None = from beginning). - end_date: End date for all (default: now). + start_item: Single lower bound for unified issues+PRs ``/issues`` fetch. + end_date: End date for all (default: None = sync through now). Returns: - (commits_saved, issue_numbers, pr_numbers) — commit count and lists of - issue/PR numbers saved during this run. + (commits_saved, issue_numbers, pr_numbers). 
""" - from django.utils import timezone as django_tz owner = OWNER repo = REPO - if end_date is None: - end_date = django_tz.now() end_date = _ensure_utc(end_date) start_commit = _ensure_utc(start_commit) - start_issue = _ensure_utc(start_issue) - start_pr = _ensure_utc(start_pr) + start_item = _ensure_utc(start_item) client = get_github_client(use="scraping") - commits_saved = 0 - issue_numbers: list[int] = [] - pr_numbers: list[int] = [] - latest_commit: datetime | None = None - latest_issue: datetime | None = None - latest_pr: datetime | None = None + pending_c, pending_i, pending_p = process_pending_clang_staging(owner, repo) + commits_saved = pending_c + issue_numbers: list[int] = list(pending_i) + pr_numbers: list[int] = list(pending_p) try: - # Commits for commit_data in fetcher.fetch_commits_from_github( client, owner, repo, start_commit, end_date ): sha = commit_data.get("sha") - if sha: - save_commit_raw_source(owner, repo, commit_data) + if not isinstance(sha, str) or not sha.strip(): + continue + sha_clean = sha.strip() + staging_path = get_commit_json_path(owner, repo, sha_clean) + _write_staging_json(staging_path, commit_data) + if _promote_commit_staging(owner, repo, staging_path, commit_data): commits_saved += 1 - dt = _commit_date(commit_data) - if dt and (latest_commit is None or dt > latest_commit): - latest_commit = dt - if latest_commit is not None: - clang_state.save_state(last_commit_date=latest_commit, merge=True) - - # Issues - for issue_data in fetcher.fetch_issues_from_github( - client, owner, repo, start_issue, end_date - ): - issue_number = issue_data.get("number") or ( - issue_data.get("issue_info") or {} - ).get("number") - if issue_number is not None: - save_issue_raw_source(owner, repo, issue_data) - issue_numbers.append(issue_number) - dt = _issue_date(issue_data) - if dt and (latest_issue is None or dt > latest_issue): - latest_issue = dt - if latest_issue is not None: - clang_state.save_state(last_issue_date=latest_issue, merge=True) - - # PRs - for pr_data in fetcher.fetch_pull_requests_from_github( - client, owner, repo, start_pr, end_date + + for item in fetcher.fetch_issues_and_prs_from_github( + client, owner, repo, start_item, end_date ): - pr_number = (pr_data.get("pr_info") or {}).get("number") or pr_data.get( - "number" - ) - if pr_number is not None: - save_pr_raw_source(owner, repo, pr_data) - pr_numbers.append(pr_number) - dt = _pr_date(pr_data) - if dt and (latest_pr is None or dt > latest_pr): - latest_pr = dt - if latest_pr is not None: - clang_state.save_state(last_pr_date=latest_pr, merge=True) + if "pr_info" in item: + pr_number = (item["pr_info"] or {}).get("number") + if pr_number is None: + continue + if isinstance(pr_number, str) and pr_number.isdigit(): + pr_number = int(pr_number) + if type(pr_number) is not int or pr_number <= 0: + continue + staging_path = get_pr_json_path(owner, repo, pr_number) + _write_staging_json(staging_path, item) + flat = normalize_pr_json(item) + num = flat.get("number") + if _promote_pr_staging(owner, repo, staging_path, item) and ( + _valid_positive_issue_number(num) + ): + pr_numbers.append(num) + else: + issue_number = (item.get("issue_info") or {}).get("number") or item.get( + "number" + ) + if issue_number is None: + continue + if isinstance(issue_number, str) and issue_number.isdigit(): + issue_number = int(issue_number) + if type(issue_number) is not int or issue_number <= 0: + continue + staging_path = get_issue_json_path(owner, repo, issue_number) + _write_staging_json(staging_path, item) + 
flat = normalize_issue_json(item) + num = flat.get("number") + if _promote_issue_staging(owner, repo, staging_path, item) and ( + _valid_positive_issue_number(num) + ): + issue_numbers.append(num) except (ConnectionException, RateLimitException) as e: logger.exception("clang_github_tracker sync failed: %s", e) diff --git a/clang_github_tracker/tests/test_backfill.py b/clang_github_tracker/tests/test_backfill.py new file mode 100644 index 00000000..d6167d27 --- /dev/null +++ b/clang_github_tracker/tests/test_backfill.py @@ -0,0 +1,62 @@ +"""Tests for backfill_clang_github_tracker.""" + +import json + +import pytest +from django.core.management import call_command + +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem +from clang_github_tracker.workspace import OWNER, REPO + + +@pytest.mark.django_db +def test_backfill_from_raw(tmp_path, monkeypatch): + root = tmp_path / "raw" / OWNER / REPO + (root / "issues").mkdir(parents=True) + (root / "prs").mkdir(parents=True) + (root / "commits").mkdir(parents=True) + (root / "issues" / "3.json").write_text( + json.dumps( + { + "issue_info": { + "number": 3, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + } + } + ), + encoding="utf-8", + ) + (root / "prs" / "4.json").write_text( + json.dumps( + { + "pr_info": { + "number": 4, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + } + } + ), + encoding="utf-8", + ) + sha = "b" * 40 + (root / "commits" / f"{sha}.json").write_text( + json.dumps( + { + "sha": sha, + "commit": { + "author": {"date": "2024-05-01T00:00:00Z"}, + }, + } + ), + encoding="utf-8", + ) + + monkeypatch.setattr( + "clang_github_tracker.management.commands.backfill_clang_github_tracker.get_raw_repo_dir", + lambda *a, **k: root, + ) + call_command("backfill_clang_github_tracker") + assert ClangGithubIssueItem.objects.filter(number=3, is_pull_request=False).exists() + assert ClangGithubIssueItem.objects.filter(number=4, is_pull_request=True).exists() + assert ClangGithubCommit.objects.filter(sha=sha).exists() diff --git a/clang_github_tracker/tests/test_commands.py b/clang_github_tracker/tests/test_commands.py index e6ab17c1..8431c65d 100644 --- a/clang_github_tracker/tests/test_commands.py +++ b/clang_github_tracker/tests/test_commands.py @@ -1,42 +1,57 @@ -"""Tests for clang_github_tracker management command (run_clang_github_tracker).""" +"""Tests for clang_github_tracker management commands.""" -import json import logging - -import pytest from io import StringIO from unittest.mock import patch +import pytest from django.core.management import call_command - -from config.workspace import get_workspace_path +from django.core.management.base import CommandError +from django.test import override_settings CMD_NAME = "run_clang_github_tracker" @pytest.mark.django_db -def test_run_clang_github_tracker_dry_run_creates_state_if_missing(caplog): - """With --dry-run and no state file, command creates state from raw scan and resolves dates.""" - workspace = get_workspace_path("clang_github_activity") - state_file = workspace / "state.json" - if state_file.exists(): - state_file.unlink() - with caplog.at_level(logging.INFO): - call_command(CMD_NAME, "--dry-run", stdout=StringIO(), stderr=StringIO()) - assert state_file.exists(), "State file should be created by command" - state = json.loads(state_file.read_text(encoding="utf-8")) - assert "last_commit_date" in state - assert "last_issue_date" in state - assert "last_pr_date" in state +def 
test_run_clang_github_tracker_dry_run_logs_resolved(caplog): + """Dry run resolves dates from DB and does not call sync.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" + ) as sync_mock: + with caplog.at_level(logging.INFO): + call_command(CMD_NAME, "--dry-run", stdout=StringIO(), stderr=StringIO()) + sync_mock.assert_not_called() assert any("Resolved:" in r.getMessage() for r in caplog.records) - assert any("Dry run" in r.getMessage() for r in caplog.records) + assert any("dry-run" in r.getMessage().lower() for r in caplog.records) @pytest.mark.django_db -def test_run_clang_github_tracker_dry_run_with_dates(caplog): - """With --from-date and --to-date and --dry-run, command does not call sync.""" +def test_run_clang_github_tracker_skip_sync(caplog): + """--skip-github-sync bypasses the GitHub sync step (not only under --dry-run).""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only" + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" + ) as sync_mock, patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + with caplog.at_level(logging.INFO): + call_command( + CMD_NAME, + "--skip-github-sync", + "--skip-pinecone", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + sync_mock.assert_not_called() + assert any("Resolved:" in r.getMessage() for r in caplog.records) + + +@pytest.mark.django_db +def test_run_clang_github_tracker_since_until_aliases(caplog): + """--from-date/--to-date aliases parse like Boost.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" ) as sync_mock: with caplog.at_level(logging.INFO): call_command( @@ -47,27 +62,152 @@ def test_run_clang_github_tracker_dry_run_with_dates(caplog): stdout=StringIO(), stderr=StringIO(), ) - sync_mock.assert_not_called() - assert any("Resolved:" in r.getMessage() for r in caplog.records) + sync_mock.assert_not_called() + assert any("Resolved:" in r.getMessage() for r in caplog.records) @pytest.mark.django_db -def test_run_clang_github_tracker_calls_sync_raw_only_when_not_dry_run(caplog): - """Without --dry-run, command calls sync_raw_only with resolved dates.""" +def test_run_clang_github_tracker_calls_sync_clang_github_activity_when_not_dry_run( + caplog, +): + """Without --dry-run, command calls sync_clang_github_activity with start_item.""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only", - return_value=(0, [], []), # commits_saved, issue_numbers, pr_numbers (lists) + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", + return_value=(0, [], []), ) as sync_mock: with caplog.at_level(logging.INFO): call_command( CMD_NAME, - "--from-date=2024-01-01", - "--to-date=2024-01-02", + "--since=2024-01-01", + "--until=2024-01-02", stdout=StringIO(), stderr=StringIO(), ) - sync_mock.assert_called_once() - call_kw = sync_mock.call_args[1] - assert "start_commit" in call_kw - assert "end_date" in call_kw - assert any("saved commits=" in r.getMessage() for r in caplog.records) + sync_mock.assert_called_once() + call_kw = sync_mock.call_args[1] + assert "start_commit" in call_kw + assert "start_item" in call_kw + assert "end_date" in call_kw + assert "start_issue" not in call_kw + assert any("commits=" in r.getMessage() for r in caplog.records) + + 
+@pytest.mark.django_db +def test_run_clang_github_tracker_skip_pinecone(caplog): + """--skip-pinecone does not call run_cppa_pinecone_sync.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", + return_value=(0, [1], []), + ): + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.call_command" + ) as cc: + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + call_command( + CMD_NAME, + "--since=2024-01-01", + "--until=2024-01-02", + "--skip-pinecone", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + pinecone_calls = [ + c for c in cc.call_args_list if c[0] and c[0][0] == "run_cppa_pinecone_sync" + ] + assert not pinecone_calls + + +@pytest.mark.django_db +@override_settings(CLANG_GITHUB_PINECONE_APP_TYPE="") +def test_run_clang_github_tracker_empty_pinecone_app_type_skips_sync(caplog): + """Empty CLANG_GITHUB_PINECONE_APP_TYPE must not call run_cppa_pinecone_sync with -issues/-prs.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", + return_value=(0, [1], []), + ): + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.call_command" + ) as cc: + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + with caplog.at_level(logging.WARNING): + call_command( + CMD_NAME, + "--since=2024-01-01", + "--until=2024-01-02", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + pinecone_calls = [ + c for c in cc.call_args_list if c[0] and c[0][0] == "run_cppa_pinecone_sync" + ] + assert not pinecone_calls + assert any( + "CLANG_GITHUB_PINECONE_APP_TYPE is empty" in r.getMessage() + for r in caplog.records + ) + + +@pytest.mark.django_db +@override_settings( + CLANG_GITHUB_CONTEXT_REPO_OWNER="myorg", + CLANG_GITHUB_CONTEXT_REPO_NAME="myrepo", + CLANG_GITHUB_CONTEXT_REPO_BRANCH="main", +) +def test_push_markdown_calls_publish_and_unlinks_new_files(tmp_path): + """_push_markdown invokes publish_clang_markdown then removes per-run md files.""" + md = tmp_path / "md_export" + md.mkdir() + f = md / "issues" / "2024" / "2024-01" + f.mkdir(parents=True) + one = f / "#1 - A.md" + one.write_text("x", encoding="utf-8") + new_files = {"issues/2024/2024-01/#1 - A.md": str(one)} + + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.publish_clang_markdown" + ) as pub: + from clang_github_tracker.management.commands.run_clang_github_tracker import ( + Command, + ) + + Command()._push_markdown(md, new_files) + + pub.assert_called_once_with(md, "myorg", "myrepo", "main", new_files) + assert not one.exists() + + +@pytest.mark.django_db +@override_settings( + CLANG_GITHUB_CONTEXT_REPO_OWNER="o", + CLANG_GITHUB_CONTEXT_REPO_NAME="r", + CLANG_GITHUB_CONTEXT_REPO_BRANCH="main", +) +def test_push_markdown_publish_failure_does_not_unlink(tmp_path): + """Failed publish leaves local md files in place.""" + md = tmp_path / "md_export" + md.mkdir() + one = md / "x.md" + one.write_text("keep", encoding="utf-8") + new_files = {"x.md": str(one)} + + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.publish_clang_markdown", + side_effect=CommandError("publish failed"), + ): + from clang_github_tracker.management.commands.run_clang_github_tracker import ( + Command, + ) + + with pytest.raises(CommandError, 
match="publish failed"): + Command()._push_markdown(md, new_files) + + assert one.exists() + assert one.read_text(encoding="utf-8") == "keep" diff --git a/clang_github_tracker/tests/test_preprocessors.py b/clang_github_tracker/tests/test_preprocessors.py new file mode 100644 index 00000000..8009237a --- /dev/null +++ b/clang_github_tracker/tests/test_preprocessors.py @@ -0,0 +1,92 @@ +"""Tests for DB-driven clang preprocessors.""" + +from datetime import timedelta +from unittest.mock import patch + +import pytest +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from clang_github_tracker.preprocessors import issue_preprocessor, pr_preprocessor + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.issue_preprocessor.build_issue_document") +def test_issue_preprocessor_db_and_failed_ids(mock_build, tmp_path, settings): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = { + "content": "body", + "metadata": {"doc_id": "u", "ids": "x"}, + } + + p10 = tmp_path / "10.json" + p10.write_text("{}", encoding="utf-8") + p99 = tmp_path / "99.json" + p99.write_text("{}", encoding="utf-8") + + ClangGithubIssueItem.objects.create( + number=10, + is_pull_request=False, + github_updated_at=timezone.now(), + ) + final = timezone.now() - timedelta(hours=1) + + def _issue_path(_owner, _repo, n): + return {10: p10, 99: p99}.get(n, tmp_path / f"missing_{n}.json") + + with patch( + "clang_github_tracker.preprocessors.issue_preprocessor.get_raw_source_issue_path", + side_effect=_issue_path, + ): + docs, chunked = issue_preprocessor.preprocess_for_pinecone( + ["llvm-project:issue:99"], final + ) + assert chunked is False + # DB watermark picks #10; failed_ids must parse llvm-project:issue:99 and add #99. 
+ assert mock_build.call_count == 2 + assert len(docs) == 2 + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.issue_preprocessor.build_issue_document") +def test_issue_preprocessor_all_rows_when_final_sync_none( + mock_build, tmp_path, settings +): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = None + p5 = tmp_path / "5.json" + p5.write_text("{}", encoding="utf-8") + ClangGithubIssueItem.objects.create( + number=5, + is_pull_request=False, + github_updated_at=timezone.now(), + ) + with patch( + "clang_github_tracker.preprocessors.issue_preprocessor.get_raw_source_issue_path", + return_value=p5, + ): + docs, _ = issue_preprocessor.preprocess_for_pinecone([], None) + assert mock_build.call_count == 1 + assert docs == [] + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.pr_preprocessor.build_pr_document") +def test_pr_preprocessor_failed_id_parsing(mock_build, tmp_path, settings): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = {"content": "p", "metadata": {"doc_id": "u", "ids": "y"}} + p7 = tmp_path / "7.json" + p7.write_text("{}", encoding="utf-8") + with patch( + "clang_github_tracker.preprocessors.pr_preprocessor.get_raw_source_pr_path", + return_value=p7, + ): + docs, chunked = pr_preprocessor.preprocess_for_pinecone( + ["llvm-project:pr:7"], timezone.now() + ) + assert chunked is False + assert mock_build.call_count == 1 + assert len(docs) == 1 diff --git a/clang_github_tracker/tests/test_publisher.py b/clang_github_tracker/tests/test_publisher.py new file mode 100644 index 00000000..a4856a98 --- /dev/null +++ b/clang_github_tracker/tests/test_publisher.py @@ -0,0 +1,189 @@ +"""Tests for clang_github_tracker.publisher.publish_clang_markdown.""" + +import subprocess +from pathlib import Path +from unittest.mock import patch + +import pytest +from django.core.management.base import CommandError +from django.test import override_settings + +from clang_github_tracker.publisher import publish_clang_markdown + + +@pytest.fixture +def raw_and_md(tmp_path: Path): + raw = tmp_path / "raw" + raw.mkdir() + md = tmp_path / "md_export" + md.mkdir() + clone_root = raw / "clang_github_tracker" / "acme" / "priv" + clone_root.mkdir(parents=True) + (clone_root / ".git").mkdir() + return raw, md, clone_root + + +def _author_settings(raw: Path): + return override_settings( + RAW_DIR=raw, + GIT_AUTHOR_NAME="Test", + GIT_AUTHOR_EMAIL="test@example.com", + ) + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_success_copies_and_pushes( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + """Happy path: overlay md_export into clone and call git_push.""" + raw, md, clone_root = raw_and_md + sub = md / "issues" / "2024" / "2024-01" + sub.mkdir(parents=True) + f = sub / "#1 - Title.md" + f.write_text("body", encoding="utf-8") + new_files = {"issues/2024/2024-01/#1 - Title.md": str(f)} + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", new_files) + + copied = clone_root / "issues" / "2024" / "2024-01" / "#1 - Title.md" + assert copied.is_file() + assert 
copied.read_text(encoding="utf-8") == "body" + mock_push.assert_called_once() + kwargs = mock_push.call_args[1] + assert kwargs["branch"] == "main" + assert kwargs["commit_message"] == "chore: update Clang issues/PRs markdown" + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_stale_title_cleanup_md_then_clone( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + """Stale titled .md is removed from md_export (via new_files), then clone uses post-cleanup disk map.""" + raw, md, clone_root = raw_and_md + sub = md / "issues" / "2024" / "2024-01" + sub.mkdir(parents=True) + new_path = sub / "#1 - New title.md" + old_path = sub / "#1 - Old title.md" + new_path.write_text("new", encoding="utf-8") + old_path.write_text("old", encoding="utf-8") + + clone_sub = clone_root / "issues" / "2024" / "2024-01" + clone_sub.mkdir(parents=True) + (clone_sub / "#1 - Old title.md").write_text("stale on clone", encoding="utf-8") + + new_files = {"issues/2024/2024-01/#1 - New title.md": str(new_path)} + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", new_files) + + assert not old_path.is_file() + assert new_path.is_file() + copied = clone_sub / "#1 - New title.md" + assert copied.is_file() + assert copied.read_text(encoding="utf-8") == "new" + assert not (clone_sub / "#1 - Old title.md").is_file() + mock_push.assert_called_once() + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_push_failure_raises_command_error( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + raw, md, _clone_root = raw_and_md + err = subprocess.CalledProcessError(1, ["git", "push"]) + err.stderr = "rejected" + err.stdout = "" + mock_push.side_effect = err + + with _author_settings(raw): + with pytest.raises(CommandError, match="Git push failed"): + publish_clang_markdown(md, "acme", "priv", "main", {}) + + +@pytest.mark.django_db +def test_publish_clang_markdown_invalid_owner(raw_and_md): + raw, md, _ = raw_and_md + with _author_settings(raw): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_clang_markdown(md, "evil/org", "priv", "main", {}) + + +@pytest.mark.django_db +def test_publish_clang_markdown_overlap_errors(tmp_path: Path): + raw = tmp_path / "raw" + raw.mkdir() + clone = raw / "clang_github_tracker" / "acme" / "priv" + clone.mkdir(parents=True) + (clone / ".git").mkdir() + with _author_settings(raw): + with pytest.raises(CommandError, match="must not overlap"): + publish_clang_markdown(clone, "acme", "priv", "main", {}) + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.clone_repo") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def 
test_publish_clang_markdown_clones_when_no_git_dir( + _token, + mock_clone, + _prepare, + _pull, + _reset, + mock_push, + tmp_path: Path, +): + """Missing .git triggers clone_repo; mock creates minimal repo after rmtree.""" + raw = tmp_path / "raw" + raw.mkdir() + md = tmp_path / "md" + md.mkdir() + clone = raw / "clang_github_tracker" / "acme" / "priv" + clone.mkdir(parents=True) + + def _clone_side_effect(_slug, dest, **_kw): + dest = Path(dest) + if dest.exists(): + import shutil + + shutil.rmtree(dest) + dest.mkdir(parents=True) + (dest / ".git").mkdir() + + mock_clone.side_effect = _clone_side_effect + + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", {}) + mock_clone.assert_called_once() diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py new file mode 100644 index 00000000..2e4120e5 --- /dev/null +++ b/clang_github_tracker/tests/test_services.py @@ -0,0 +1,239 @@ +"""Tests for clang_github_tracker.services.""" + +from datetime import timedelta + +import pytest +from django.utils import timezone + +from clang_github_tracker import services as clang_services +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem + + +@pytest.mark.django_db +def test_upsert_issue_item_rejects_bool_and_non_positive(): + t0 = timezone.now() + with pytest.raises(ValueError, match="positive integer"): + clang_services.upsert_issue_item( + True, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + with pytest.raises(ValueError, match="positive integer"): + clang_services.upsert_issue_item( + 0, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + assert ClangGithubIssueItem.objects.count() == 0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_skips_bool_does_not_upsert_as_issue_one(): + t0 = timezone.now() + ins, _ = clang_services.upsert_issue_items_batch([(True, False, t0, t0)]) + assert ins == 0 + assert not ClangGithubIssueItem.objects.filter(number=1).exists() + + +@pytest.mark.django_db +def test_upsert_issue_item_create_and_update_bumps_updated_at(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + _, created = clang_services.upsert_issue_item( + 42, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + assert created is True + row = ClangGithubIssueItem.objects.get(number=42) + first_updated = row.updated_at + + _, created2 = clang_services.upsert_issue_item( + 42, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t1, + ) + assert created2 is False + row.refresh_from_db() + assert row.github_updated_at == t1 + assert row.updated_at >= first_updated + + +@pytest.mark.django_db +def test_watermarks_empty(): + assert clang_services.get_issue_item_watermark() is None + assert clang_services.get_commit_watermark() is None + assert clang_services.start_after_watermark(None) is None + + +@pytest.mark.django_db +def test_upsert_commits_batch_create_and_update(): + sha_a = "a" * 40 + sha_b = "b" * 40 + t0 = timezone.now() - timedelta(days=1) + t1 = timezone.now() + ins, upd = clang_services.upsert_commits_batch([(sha_a, t0), (sha_b, t0)]) + assert ins == 2 and upd == 0 + row = ClangGithubCommit.objects.get(sha=sha_a) + first_updated = row.updated_at + ins2, upd2 = clang_services.upsert_commits_batch([(sha_a, t1)]) + assert ins2 == 0 and upd2 == 1 + row.refresh_from_db() + assert row.github_committed_at == t1 + assert row.updated_at >= first_updated + + 
+@pytest.mark.django_db +def test_upsert_issue_items_batch_create_and_update(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + ins, upd = clang_services.upsert_issue_items_batch( + [(10, False, t0, t0), (11, True, t0, t0)] + ) + assert ins == 2 and upd == 0 + row = ClangGithubIssueItem.objects.get(number=10) + first_updated = row.updated_at + ins2, upd2 = clang_services.upsert_issue_items_batch([(10, False, t0, t1)]) + assert ins2 == 0 and upd2 == 1 + row.refresh_from_db() + assert row.github_updated_at == t1 + assert row.updated_at >= first_updated + + +@pytest.mark.django_db +def test_upsert_commits_batch_dedupes_sha_by_case(): + """Uppercase and lowercase hex refer to the same commit; merge timestamps in one row.""" + sha_lower = "abcdef" + "0" * 34 + sha_upper = "ABCDEF" + "0" * 34 + t_new = timezone.now() + t_old = t_new - timedelta(days=7) + ins, _ = clang_services.upsert_commits_batch( + [(sha_upper, t_old), (sha_lower, t_new)] + ) + assert ins == 1 + assert ClangGithubCommit.objects.count() == 1 + row = ClangGithubCommit.objects.get(sha=sha_lower) + assert row.github_committed_at == t_new + + +@pytest.mark.django_db +def test_upsert_commit_canonicalizes_sha_to_lowercase(): + sha_mixed = "AbCdEf" + "0" * 34 + t0 = timezone.now() + clang_services.upsert_commit(sha_mixed, github_committed_at=t0) + row = ClangGithubCommit.objects.get(sha=sha_mixed.lower()) + assert row.github_committed_at == t0 + + +@pytest.mark.django_db +def test_upsert_commits_batch_duplicate_sha_keeps_latest_committed_at(): + sha = "c" * 40 + t_new = timezone.now() + t_old = t_new - timedelta(days=7) + clang_services.upsert_commits_batch([(sha, t_new), (sha, t_old)]) + row = ClangGithubCommit.objects.get(sha=sha) + assert row.github_committed_at == t_new + + +@pytest.mark.django_db +def test_upsert_commits_batch_duplicate_sha_none_does_not_wipe_timestamp(): + sha = "d" * 40 + t0 = timezone.now() - timedelta(hours=1) + clang_services.upsert_commits_batch([(sha, t0), (sha, None)]) + assert ClangGithubCommit.objects.get(sha=sha).github_committed_at == t0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_duplicate_number_keeps_latest_github_updated_at(): + t_base = timezone.now() - timedelta(days=5) + t_new = timezone.now() + t_old = t_new - timedelta(days=1) + clang_services.upsert_issue_items_batch( + [ + (7, False, t_base, t_new), + (7, False, t_base, t_old), + ] + ) + row = ClangGithubIssueItem.objects.get(number=7) + assert row.github_updated_at == t_new + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_duplicate_merges_is_pr_or(): + t0 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(8, False, t0, t0), (8, True, t0, t0)]) + assert ClangGithubIssueItem.objects.get(number=8).is_pull_request is True + + +@pytest.mark.django_db +def test_upsert_issue_item_merge_keeps_pr_and_timestamps_when_incoming_partial(): + t_created = timezone.now() - timedelta(days=10) + t_updated = timezone.now() - timedelta(days=3) + clang_services.upsert_issue_item( + 99, + is_pull_request=True, + github_created_at=t_created, + github_updated_at=t_updated, + ) + clang_services.upsert_issue_item( + 99, + is_pull_request=False, + github_created_at=None, + github_updated_at=None, + ) + row = ClangGithubIssueItem.objects.get(number=99) + assert row.is_pull_request is True + assert row.github_created_at == t_created + assert row.github_updated_at == t_updated + + +@pytest.mark.django_db +def test_upsert_issue_item_merge_github_updated_at_max(): + 
t_old = timezone.now() - timedelta(days=5) + t_new = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_item( + 100, + is_pull_request=False, + github_created_at=t_old, + github_updated_at=t_new, + ) + clang_services.upsert_issue_item( + 100, + is_pull_request=False, + github_created_at=None, + github_updated_at=t_old, + ) + assert ClangGithubIssueItem.objects.get(number=100).github_updated_at == t_new + + +@pytest.mark.django_db +def test_upsert_commit_merge_preserves_committed_at_when_incoming_none(): + sha = "e" * 40 + t0 = timezone.now() - timedelta(hours=2) + clang_services.upsert_commit(sha, github_committed_at=t0) + clang_services.upsert_commit(sha, github_committed_at=None) + assert ClangGithubCommit.objects.get(sha=sha).github_committed_at == t0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_merge_with_db_preserves_updated_when_incoming_none(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(20, False, t0, t1)]) + clang_services.upsert_issue_items_batch([(20, False, None, None)]) + row = ClangGithubIssueItem.objects.get(number=20) + assert row.github_created_at == t0 + assert row.github_updated_at == t1 + assert row.is_pull_request is False + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_merge_with_db_keeps_pr_once_true(): + t0 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(21, True, t0, t0)]) + clang_services.upsert_issue_items_batch([(21, False, t0, t0)]) + assert ClangGithubIssueItem.objects.get(number=21).is_pull_request is True diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 6e3c36e8..cf0338a0 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -1,34 +1,90 @@ -"""Tests for clang_github_tracker.state (no DB).""" +"""Tests for clang_github_tracker.state_manager (DB-backed date resolution).""" -from unittest.mock import patch +from datetime import timedelta +import pytest +from django.utils import timezone from clang_github_tracker import state_manager as clang_state +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem -def test_parse_iso_valid(): - """parse_iso returns datetime for valid ISO strings.""" - dt = clang_state.parse_iso("2024-01-15T10:30:00Z") - assert dt is not None - assert dt.year == 2024 and dt.month == 1 and dt.day == 15 - dt2 = clang_state.parse_iso("2024-06-01T00:00:00+00:00") - assert dt2 is not None - assert dt2.month == 6 - - -def test_parse_iso_invalid_or_empty(): - """parse_iso returns None for empty or invalid input.""" - assert clang_state.parse_iso(None) is None - assert clang_state.parse_iso("") is None - assert clang_state.parse_iso(" ") is None - assert clang_state.parse_iso("not-a-date") is None - - -def test_compute_state_from_raw_empty_dir(tmp_path): - """When raw repo dir does not exist, compute_state_from_raw returns nulls.""" - with patch("clang_github_tracker.state_manager.get_raw_repo_dir") as m: - m.return_value = tmp_path / "nonexistent_repo_dir" - result = clang_state.compute_state_from_raw() - assert result[clang_state.KEY_LAST_COMMIT_DATE] is None - assert result[clang_state.KEY_LAST_ISSUE_DATE] is None - assert result[clang_state.KEY_LAST_PR_DATE] is None +@pytest.mark.django_db +def test_resolve_empty_db_no_since_until(): + """Empty tables → None starts; end None until caller passes --until.""" + 
ClangGithubIssueItem.objects.all().delete() + ClangGithubCommit.objects.all().delete() + sc, si, end = clang_state.resolve_start_end_dates(None, None) + assert sc is None and si is None + assert end is None + + +@pytest.mark.django_db +def test_resolve_db_watermark_plus_one_millisecond(): + """Max github fields drive start = max + 1ms (API lower bound).""" + base = timezone.now() - timedelta(days=1) + ClangGithubIssueItem.objects.create( + number=1, + is_pull_request=False, + github_created_at=base, + github_updated_at=base, + ) + ClangGithubCommit.objects.create( + sha="a" * 40, + github_committed_at=base, + ) + sc, si, _end = clang_state.resolve_start_end_dates(None, None) + delta = timedelta(milliseconds=1) + assert sc == base + delta + assert si == base + delta + + +@pytest.mark.django_db +def test_resolve_both_since_until_closed_window(): + """Both bounds valid → same since for commit and item; until as end.""" + since = timezone.now() - timedelta(days=10) + until = timezone.now() - timedelta(days=5) + sc, si, end = clang_state.resolve_start_end_dates(since, until) + assert sc == since + assert si == since + assert end == until + + +@pytest.mark.django_db +def test_resolve_invalid_range_clears_bounds(caplog): + """since > until → warning and DB-based resolution.""" + wm = timezone.now() - timedelta(hours=1) + ClangGithubIssueItem.objects.create( + number=99, + is_pull_request=False, + github_updated_at=wm, + ) + ClangGithubCommit.objects.create( + sha="c" * 40, + github_committed_at=wm, + ) + since = timezone.now() + until = timezone.now() - timedelta(days=1) + with caplog.at_level("WARNING"): + sc, si, end = clang_state.resolve_start_end_dates(since, until) + assert any("invalid date range" in r.getMessage() for r in caplog.records) + assert end is None + delta = timedelta(milliseconds=1) + assert sc == wm + delta + assert si == wm + delta + + +@pytest.mark.django_db +def test_resolve_since_floor_without_until(): + """Only since: both starts equal the explicit since; DB watermarks are ignored.""" + base = timezone.now() - timedelta(days=30) + ClangGithubIssueItem.objects.create( + number=2, + is_pull_request=False, + github_updated_at=base, + ) + since = timezone.now() - timedelta(days=1) + sc, si, end = clang_state.resolve_start_end_dates(since, None) + assert sc == since + assert si == since + assert end is None diff --git a/clang_github_tracker/workspace.py b/clang_github_tracker/workspace.py index 349d73eb..a7d495a9 100644 --- a/clang_github_tracker/workspace.py +++ b/clang_github_tracker/workspace.py @@ -1,9 +1,9 @@ """ -Workspace paths for clang_github_tracker: state file and raw GitHub activity dir. +Workspace paths for clang_github_tracker: md export, raw GitHub JSON. 
Layout: - workspace/clang_github_activity/ - - state.json + workspace/github_activity_tracker/ + - md_export/ (generated Markdown for GitHub publish) workspace/raw/github_activity_tracker/// - commits/, issues/, prs/ """ @@ -15,11 +15,9 @@ from config.workspace import get_workspace_path -_APP_SLUG = "clang_github_activity" +_APP_SLUG = "github_activity_tracker" _RAW_APP_SLUG = "github_activity_tracker" -STATE_FILENAME = "state.json" - # Repo we sync (raw only, no DB); from settings (env: CLANG_GITHUB_OWNER, CLANG_GITHUB_REPO) OWNER = settings.CLANG_GITHUB_OWNER REPO = settings.CLANG_GITHUB_REPO @@ -37,15 +35,10 @@ def _sanitize_segment(value: str, label: str) -> str: def get_workspace_root() -> Path: - """Return workspace/clang_github_activity/; creates dir if missing.""" + """Return workspace/clang_github_tracker/; creates dir if missing.""" return get_workspace_path(_APP_SLUG) -def get_state_path() -> Path: - """Return workspace/clang_github_activity/state.json. Parent dir created on first write.""" - return get_workspace_root() / STATE_FILENAME - - def get_raw_root() -> Path: """Return workspace/raw/github_activity_tracker/; creates dirs if missing.""" path = get_workspace_path("raw") / _RAW_APP_SLUG diff --git a/config/settings.py b/config/settings.py index d76dfe8e..cecb131d 100644 --- a/config/settings.py +++ b/config/settings.py @@ -67,6 +67,8 @@ "clang_github_tracker", "cppa_slack_tracker", "discord_activity_tracker", + "wg21_paper_tracker", + "cppa_youtube_script_tracker", "slack_event_handler", ] @@ -159,6 +161,8 @@ "cppa_slack_tracker", "discord_activity_tracker", "boost_mailing_list_tracker", + "wg21_paper_tracker", + "cppa_youtube_script_tracker", "shared", ) WORKSPACE_DIR.mkdir(parents=True, exist_ok=True) @@ -167,10 +171,10 @@ # ============================================================================= # Clang GitHub Tracker -# Syncs llvm/llvm-project (issues, PRs, commits) to raw workspace only (no DB). -# After sync, updated issues/PRs are exported as Markdown and pushed to the -# private repo below. If OWNER or NAME is not set, upload is skipped and an -# error is logged. +# Syncs llvm/llvm-project (issues, PRs, commits) to raw + DB. +# Markdown export push target: CLANG_GITHUB_CONTEXT_REPO_OWNER / _NAME / _BRANCH +# (separate from CLANG_GITHUB_OWNER + CLANG_GITHUB_REPO, the upstream llvm source). +# If context owner or name is unset, push is skipped and an error is logged. # Folder structure: issues/YYYY/YYYY-MM/#N - title.md (no repo prefix) # ============================================================================= # Boost GitHub owner (used by boost_library_tracker preprocessors for Pinecone sync) @@ -192,6 +196,8 @@ PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() or "aws" # Chunking and batching PINECONE_BATCH_SIZE = env.int("PINECONE_BATCH_SIZE", default=96) +# Parallel threads for Pinecone metadata-only updates (update_documents); lower if you hit 429s. 
+PINECONE_UPDATE_MAX_WORKERS = env.int("PINECONE_UPDATE_MAX_WORKERS", default=8) PINECONE_CHUNK_SIZE = env.int("PINECONE_CHUNK_SIZE", default=1000) PINECONE_CHUNK_OVERLAP = env.int("PINECONE_CHUNK_OVERLAP", default=200) PINECONE_MIN_TEXT_LENGTH = env.int("PINECONE_MIN_TEXT_LENGTH", default=50) @@ -205,6 +211,13 @@ env("PINECONE_SPARSE_MODEL", default="pinecone-sparse-english-v0") or "pinecone-sparse-english-v0" ).strip() or "pinecone-sparse-english-v0" +# Slack → Pinecone namespace/app_type prefix (cppa_pinecone_sync / slack pipelines) +PINECONE_SLACK_NAMESPACE_PREFIX = ( + env("PINECONE_SLACK_NAMESPACE_PREFIX", default="slack") or "slack" +).strip() or "slack" +PINECONE_SLACK_APP_TYPE_PREFIX = ( + env("PINECONE_SLACK_APP_TYPE_PREFIX", default="slack") or "slack" +).strip() or "slack" # Pinecone sync: app_type and namespace per app (used when CLI does not pass --pinecone-app-type/--pinecone-namespace) # Boost Mailing List Tracker @@ -238,15 +251,17 @@ CLANG_GITHUB_REPO = ( env("CLANG_GITHUB_REPO", default="llvm-project") or "llvm-project" ).strip() or "llvm-project" -CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", default="") or "" +CLANG_GITHUB_CONTEXT_REPO_OWNER = ( + env("CLANG_GITHUB_CONTEXT_REPO_OWNER", default="") or "" ).strip() -CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", default="") or "" +CLANG_GITHUB_CONTEXT_REPO_NAME = ( + env("CLANG_GITHUB_CONTEXT_REPO_NAME", default="") or "" ).strip() -CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", default="main") or "main" +CLANG_GITHUB_CONTEXT_REPO_BRANCH = ( + env("CLANG_GITHUB_CONTEXT_REPO_BRANCH", default="") or "" ).strip() +# Markdown publish: persistent git clone under RAW_DIR/clang_github_tracker///; +# clone/pull/push use GITHUB_TOKEN_WRITE (via get_github_token write); GIT_AUTHOR_* for commits. # GitHub tokens (multiple use cases: scraping, write) # - GITHUB_TOKEN: fallback when a specific token is not set @@ -289,19 +304,28 @@ env("BOOST_LIBRARY_TRACKER_REPO_BRANCH", default="master") or "master" ).strip() -# Settings for publishing boost_library_usage_dashboard +# ============================================================================= +# Boost Library Usage Dashboard +# run_boost_library_usage_dashboard writes artifacts under the workspace, then +# optionally publishes to the GitHub repo below (unless --skip-publish). Clone, +# pull, and push use GITHUB_TOKEN_WRITE. If PUBLISH_OWNER / PUBLISH_REPO are +# unset, publish is skipped (CLI --owner / --repo can override). GIT_AUTHOR_* +# set commit author for that push only (via git env vars, not git config). 
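A rough sketch of what "via git env vars, not git config" means in practice for the publish commit (illustrative only; commit_with_author and its arguments are invented, not the project's publish helper):

import os
import subprocess

from django.conf import settings


def commit_with_author(repo_dir: str, message: str) -> None:
    """Commit staged changes using GIT_AUTHOR_* from settings, leaving git config untouched."""
    env = os.environ.copy()
    env["GIT_AUTHOR_NAME"] = settings.GIT_AUTHOR_NAME
    env["GIT_AUTHOR_EMAIL"] = settings.GIT_AUTHOR_EMAIL
    # Git also requires a committer identity; reuse the same values so nothing machine-local leaks in.
    env["GIT_COMMITTER_NAME"] = settings.GIT_AUTHOR_NAME
    env["GIT_COMMITTER_EMAIL"] = settings.GIT_AUTHOR_EMAIL
    subprocess.run(["git", "commit", "-m", message], cwd=repo_dir, env=env, check=True)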
+# ============================================================================= BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", default="") or "" ).strip() BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", default="") or "" ).strip() -BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN = ( - env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN", default="") or "" -).strip() or GITHUB_TOKEN_WRITE BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", default="") or "" ).strip() +GIT_AUTHOR_NAME = (env("GIT_AUTHOR_NAME", default="unknown") or "unknown").strip() +GIT_AUTHOR_EMAIL = ( + env("GIT_AUTHOR_EMAIL", default="unknown@noreply.github.com") + or "unknown@noreply.github.com" +).strip() # Slack (bot + app token for operations.slack_ops and cppa_slack_transcript_tracker) @@ -311,44 +335,30 @@ SLACK_TEAM_ID = (env("SLACK_TEAM_ID", default="") or "").strip() -def _slack_bot_token_from_env(): - """Build a dict of team_id -> bot token from SLACK_TEAM_IDS and SLACK_BOT_TOKEN_ env vars.""" - out = {} +def _slack_team_ids_from_env(): + """Comma-separated SLACK_TEAM_IDS → non-empty team id strings.""" ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() if not ids_raw: - return out - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue - key = f"SLACK_BOT_TOKEN_{tid}" - token = (env(key, default="") or "").strip() - if token: - out[tid] = token - return out + return [] + return [tid.strip() for tid in ids_raw.split(",") if tid.strip()] -SLACK_BOT_TOKEN = _slack_bot_token_from_env() - - -def _slack_app_token_from_env(): - """Build a dict of team_id -> app token from SLACK_TEAM_IDS and SLACK_APP_TOKEN_ env vars.""" +def _slack_per_team_tokens_from_env(env_key_prefix: str): + """ + Build team_id -> token from SLACK_TEAM_IDS and ``{prefix}_{team_id}`` env vars + (e.g. prefix SLACK_BOT_TOKEN → SLACK_BOT_TOKEN_T123). + """ out = {} - ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() - if not ids_raw: - return out - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue - key = f"SLACK_APP_TOKEN_{tid}" + for tid in _slack_team_ids_from_env(): + key = f"{env_key_prefix}_{tid}" token = (env(key, default="") or "").strip() if token: out[tid] = token return out -SLACK_APP_TOKEN = _slack_app_token_from_env() +SLACK_BOT_TOKEN = _slack_per_team_tokens_from_env("SLACK_BOT_TOKEN") +SLACK_APP_TOKEN = _slack_per_team_tokens_from_env("SLACK_APP_TOKEN") def _slack_team_scope_from_env(): @@ -359,14 +369,8 @@ def _slack_team_scope_from_env(): If SLACK_TEAM_SCOPE_ is missing or empty, that team gets [0, 1] (both). 
""" out = {} - ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() - if not ids_raw: - return out valid_scopes = {0, 1} - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue + for tid in _slack_team_ids_from_env(): key = f"SLACK_TEAM_SCOPE_{tid}" raw = (env(key, default="") or "").strip() if not raw: @@ -449,6 +453,16 @@ def _slack_team_scope_from_env(): ) ).resolve() +# WG21 Paper Tracker Configuration +WG21_GITHUB_DISPATCH_ENABLED = env.bool("WG21_GITHUB_DISPATCH_ENABLED", default=False) +WG21_GITHUB_DISPATCH_REPO = (env("WG21_GITHUB_DISPATCH_REPO", default="") or "").strip() +WG21_GITHUB_DISPATCH_TOKEN = ( + env("WG21_GITHUB_DISPATCH_TOKEN", default="") or "" +).strip() +WG21_GITHUB_DISPATCH_EVENT_TYPE = ( + env("WG21_GITHUB_DISPATCH_EVENT_TYPE", default="wg21_papers_convert") or "" +).strip() or "wg21_papers_convert" + # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) LOG_FILE = env("LOG_FILE", default="app.log") @@ -469,6 +483,8 @@ def _slack_team_scope_from_env(): ENABLE_ERROR_NOTIFICATIONS = env.bool("ENABLE_ERROR_NOTIFICATIONS", default=False) DISCORD_WEBHOOK_URL = env("DISCORD_WEBHOOK_URL", default="") SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL", default="") +# Post to webhooks after deploy (see make notify / send_startup_notification) +ENABLE_STARTUP_NOTIFICATIONS = env.bool("ENABLE_STARTUP_NOTIFICATIONS", default=True) LOGGING = { "version": 1, @@ -539,39 +555,6 @@ def _slack_team_scope_from_env(): ) CELERY_BEAT_SCHEDULE = {} -# ============================================================================= -# Pinecone (cppa_pinecone_sync) - vector index for RAG sync -# ============================================================================= -# Public API key (default). Used when instance=public or unset. -PINECONE_API_KEY = (env("PINECONE_API_KEY", default="") or "").strip() -# Private API key. Used when instance=private. -PINECONE_PRIVATE_API_KEY = (env("PINECONE_PRIVATE_API_KEY", default="") or "").strip() -# Index name (required for sync). Set in .env to enable Slack/mailing list → Pinecone. -PINECONE_INDEX_NAME = (env("PINECONE_INDEX_NAME", default="") or "").strip() -PINECONE_ENVIRONMENT = ( - env("PINECONE_ENVIRONMENT", default="us-east-1") or "us-east-1" -).strip() -PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() -PINECONE_BATCH_SIZE = int(env("PINECONE_BATCH_SIZE", default="96") or "96") -PINECONE_CHUNK_SIZE = int(env("PINECONE_CHUNK_SIZE", default="1000") or "1000") -PINECONE_CHUNK_OVERLAP = int(env("PINECONE_CHUNK_OVERLAP", default="200") or "200") -PINECONE_MIN_TEXT_LENGTH = int(env("PINECONE_MIN_TEXT_LENGTH", default="50") or "50") -PINECONE_MIN_WORDS = int(env("PINECONE_MIN_WORDS", default="5") or "5") -PINECONE_SLACK_NAMESPACE_PREFIX = ( - env("PINECONE_SLACK_NAMESPACE_PREFIX", default="slack") or "slack" -).strip() -PINECONE_SLACK_APP_TYPE_PREFIX = ( - env("PINECONE_SLACK_APP_TYPE_PREFIX", default="slack") or "slack" -).strip() -PINECONE_DENSE_MODEL = ( - env("PINECONE_DENSE_MODEL", default="multilingual-e5-large") - or "multilingual-e5-large" -).strip() -PINECONE_SPARSE_MODEL = ( - env("PINECONE_SPARSE_MODEL", default="pinecone-sparse-english-v0") - or "pinecone-sparse-english-v0" -).strip() - # GitHub activity tracker: Redis for ETag cache (conditional GET). Use separate DB index. 
# To persist the cache across restarts, enable Redis persistence (RDB or AOF) in redis.conf: # RDB: leave default "save" rules (e.g. save 900 1) and set dir/dbfilename. @@ -598,3 +581,22 @@ def _slack_team_scope_from_env(): "level": "ERROR", } LOGGING["root"]["handlers"].append("slack") + +# YouTube (cppa_youtube_script_tracker) +YOUTUBE_API_KEY = (env("YOUTUBE_API_KEY", default="") or "").strip() +YOUTUBE_PINECONE_NAMESPACE = ( + env("YOUTUBE_PINECONE_NAMESPACE", default="youtube-scripts") or "youtube-scripts" +).strip() +YOUTUBE_DEFAULT_PUBLISHED_AFTER = ( + env("YOUTUBE_DEFAULT_PUBLISHED_AFTER", default="") or "" +).strip() +# You can add your own Django apps here by adding them to the EXTRA_INSTALLED_APPS list in config/local_settings.py. +try: + from . import local_settings as _local_settings + + _LOCAL_EXTRA_INSTALLED_APPS = tuple( + getattr(_local_settings, "EXTRA_INSTALLED_APPS", ()) + ) +except ImportError: + _LOCAL_EXTRA_INSTALLED_APPS = () +INSTALLED_APPS = [*INSTALLED_APPS, *_LOCAL_EXTRA_INSTALLED_APPS] diff --git a/config/test_settings.py b/config/test_settings.py index b724fd9f..c1c3f837 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -50,7 +50,7 @@ for _slug in ( "github_activity_tracker", "boost_library_tracker", - "clang_github_activity", + "clang_github_tracker", "discord_activity_tracker", "shared", ): @@ -65,3 +65,7 @@ # Clang GitHub Tracker (tests use defaults) CLANG_GITHUB_OWNER = "llvm" CLANG_GITHUB_REPO = "llvm-project" +# Do not inherit publish target from developer .env (avoids real git / token in tests). +CLANG_GITHUB_CONTEXT_REPO_OWNER = "" +CLANG_GITHUB_CONTEXT_REPO_NAME = "" +CLANG_GITHUB_CONTEXT_REPO_BRANCH = "" diff --git a/core/management/__init__.py b/core/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/management/commands/__init__.py b/core/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/management/commands/send_startup_notification.py b/core/management/commands/send_startup_notification.py new file mode 100644 index 00000000..7d5253fa --- /dev/null +++ b/core/management/commands/send_startup_notification.py @@ -0,0 +1,210 @@ +""" +Post deploy/startup status to Slack and Discord webhooks (DB, Celery beat schedule, workers). 
+Invoked after health checks via: DEPLOY_BRANCH= make notify +""" + +import json +import logging +import os +import sys +from datetime import datetime, timezone +from urllib import request +from urllib.error import URLError + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.db import connection + +from celery.schedules import crontab, schedule as celery_interval_schedule + +from config.celery import app as celery_app + +logger = logging.getLogger(__name__) + +BEAT_LINES_CAP = 25 + + +def _crontab_field_to_sorted_ints(field): + if field is None: + return None + if isinstance(field, int): + return [field] + if isinstance(field, (set, frozenset)): + return sorted(field) + if hasattr(field, "__iter__") and not isinstance(field, (str, bytes)): + try: + return sorted(int(x) for x in field) + except (TypeError, ValueError): + return None + return None + + +def _crontab_is_universal_star(field): + if field is None: + return True + s = str(field).strip() + return s in ("*", "**", "None") + + +def describe_celery_schedule(sched) -> str: + if isinstance(sched, celery_interval_schedule): + run_every = getattr(sched, "run_every", None) + if run_every is not None: + minutes = int(run_every.total_seconds() // 60) + return f"every {minutes} minutes" + return repr(sched) + if isinstance(sched, crontab): + hours = _crontab_field_to_sorted_ints(sched.hour) + minutes = _crontab_field_to_sorted_ints(sched.minute) + parts = [] + if ( + hours is not None + and minutes is not None + and len(hours) == 1 + and len(minutes) == 1 + ): + parts.append(f"{hours[0]:02d}:{minutes[0]:02d} UTC") + else: + parts.append(f"crontab hour={sched.hour!r} minute={sched.minute!r}") + if not _crontab_is_universal_star(getattr(sched, "day_of_week", None)): + parts.append(f"dow={sched.day_of_week!r}") + if not _crontab_is_universal_star(getattr(sched, "day_of_month", None)): + parts.append(f"dom={sched.day_of_month!r}") + if not _crontab_is_universal_star(getattr(sched, "month_of_year", None)): + parts.append(f"moy={sched.month_of_year!r}") + return " ".join(parts) + return repr(sched) + + +def collect_beat_lines(beat_schedule: dict) -> tuple[list[str], int]: + lines = [] + total = len(beat_schedule) + for name in sorted(beat_schedule.keys()): + entry = beat_schedule[name] + task = entry.get("task", "?") + sch = entry.get("schedule") + cadence = describe_celery_schedule(sch) if sch is not None else "?" 
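+        # One formatted line per beat entry; handle() truncates the list to BEAT_LINES_CAP before posting.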
+ lines.append(f"- `{name}` → `{task}` @ {cadence}") + return lines, total + + +def post_discord(webhook_url: str, title: str, description: str) -> None: + embed = { + "title": title, + "description": description[:4000], + "color": 0x3498DB, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + payload = {"username": "Boost Data Collector", "embeds": [embed]} + data = json.dumps(payload).encode("utf-8") + req = request.Request( + webhook_url, + data=data, + headers={"Content-Type": "application/json"}, + ) + with request.urlopen(req, timeout=15) as resp: + if resp.status not in (200, 204): + logger.warning("Discord webhook returned status %s", resp.status) + + +def post_slack(webhook_url: str, title: str, text: str) -> None: + blocks = [ + { + "type": "header", + "text": {"type": "plain_text", "text": title, "emoji": True}, + }, + {"type": "section", "text": {"type": "mrkdwn", "text": f"```{text[:2800]}```"}}, + ] + payload = { + "username": "Boost Data Collector", + "blocks": blocks, + "icon_emoji": ":white_check_mark:", + } + data = json.dumps(payload).encode("utf-8") + req = request.Request( + webhook_url, + data=data, + headers={"Content-Type": "application/json"}, + ) + with request.urlopen(req, timeout=15) as resp: + if resp.status != 200: + logger.warning("Slack webhook returned status %s", resp.status) + + +class Command(BaseCommand): + help = "Send startup/deploy notification to Slack and Discord webhooks." + + def handle(self, *args, **options): + if not getattr(settings, "ENABLE_STARTUP_NOTIFICATIONS", True): + logger.info( + "Startup notifications disabled (ENABLE_STARTUP_NOTIFICATIONS)." + ) + return + + discord_url = (getattr(settings, "DISCORD_WEBHOOK_URL", None) or "").strip() + slack_url = (getattr(settings, "SLACK_WEBHOOK_URL", None) or "").strip() + if not discord_url and not slack_url: + logger.info( + "No DISCORD_WEBHOOK_URL or SLACK_WEBHOOK_URL; skipping notification." 
+ ) + return + + notify_at = datetime.now(timezone.utc) + branch = os.environ.get("DEPLOY_BRANCH", "").strip() or "unknown" + + db_line = "DB: error" + try: + connection.ensure_connection() + tables = connection.introspection.table_names() + db_line = f"DB: OK, {len(tables)} tables" + except Exception as exc: + db_line = f"DB: failed ({exc})" + + beat_schedule = dict(celery_app.conf.beat_schedule or {}) + beat_lines, beat_total = collect_beat_lines(beat_schedule) + shown = beat_lines[:BEAT_LINES_CAP] + beat_block = "\n".join(shown) + if beat_total > len(shown): + beat_block += f"\n… and {beat_total - len(shown)} more" + + worker_line = "Celery workers: unknown" + try: + insp = celery_app.control.inspect(timeout=5.0) + pong = insp.ping() if insp else None + n = len(pong) if pong else 0 + worker_line = f"Celery workers: {n} (ping)" + except Exception as exc: + worker_line = f"Celery workers: inspect failed ({exc})" + + text_body = ( + f"Time (UTC): {notify_at.strftime('%Y-%m-%d %H:%M:%S')}\n" + f"Branch: {branch}\n" + f"{db_line}\n" + f"{worker_line}\n" + f"Celery beat entries: {beat_total}\n" + f"{beat_block if beat_block else '(none)'}" + ) + + title = "Boost Data Collector — stack healthy" + errors = [] + if discord_url: + try: + post_discord(discord_url, title, text_body) + except URLError as e: + errors.append(f"Discord: {e}") + except Exception as e: + errors.append(f"Discord: {e}") + if slack_url: + try: + post_slack(slack_url, title, text_body) + except URLError as e: + errors.append(f"Slack: {e}") + except Exception as e: + errors.append(f"Slack: {e}") + + if errors: + for err in errors: + logger.error("%s", err) + sys.exit(1) + + logger.info("Startup notification sent.") diff --git a/core/tests/test_boost_version_operations.py b/core/tests/test_boost_version_operations.py new file mode 100644 index 00000000..5f31b8dc --- /dev/null +++ b/core/tests/test_boost_version_operations.py @@ -0,0 +1,78 @@ +"""Tests for core.utils.boost_version_operations.""" + +import pytest + +from core.utils.boost_version_operations import ( + compare_boost_version_tuples, + compare_encoded_versions, + compare_loose_version_strings, + decode_boost_version, + encode_boost_version, + encode_boost_version_string, + loose_version_tuple, + normalize_boost_version_string, + parse_boost_version_string, + parse_stable_boost_release_tag, +) + + +def test_encode_decode_round_trip(): + assert encode_boost_version(1, 86, 0) == 108_600 + assert decode_boost_version(108_600) == (1, 86, 0) + assert decode_boost_version(1_00_900) == (1, 9, 0) + + +def test_encode_boost_version_string(): + assert encode_boost_version_string("1.86.0") == 108_600 + assert encode_boost_version_string("boost-1.10.0") == 101_000 + assert encode_boost_version_string("1_56_0") == 105_600 + + +def test_parse_invalid_returns_none(): + assert parse_boost_version_string("") is None + assert parse_boost_version_string("not-a-version") is None + + +def test_encode_rejects_out_of_range(): + with pytest.raises(ValueError): + encode_boost_version(1, 1000, 0) + with pytest.raises(ValueError): + encode_boost_version(1, 0, 100) + + +def test_loose_version_tuple_empty_and_digits(): + assert loose_version_tuple("") == (0, 0, 0) + assert loose_version_tuple("1.82.x") == (1, 82, 0) + assert loose_version_tuple("release-2.1.9-extra") == (2, 1, 9) + + +def test_normalize_boost_version_string(): + assert normalize_boost_version_string("1.82") == "1.82.0" + assert normalize_boost_version_string("0.99") is None + assert normalize_boost_version_string("") is None + 
assert normalize_boost_version_string("boost-1.2.3") == "1.2.3" + + +def test_compare_boost_version_tuples(): + assert compare_boost_version_tuples((1, 0, 0), (2, 0, 0)) == -1 + assert compare_boost_version_tuples((1, 82, 0), (1, 82, 0)) == 0 + assert compare_boost_version_tuples((2, 0, 0), (1, 99, 99)) == 1 + + +def test_compare_loose_version_strings(): + assert compare_loose_version_strings("1.0", "2.0") == -1 + assert compare_loose_version_strings("1.82.x", "1.81.0") == 1 + + +def test_compare_encoded_versions(): + assert compare_encoded_versions(100_000, 200_000) == -1 + assert compare_encoded_versions(108_600, 108_600) == 0 + + +def test_parse_stable_boost_release_tag(): + min_v = (1, 16, 1) + assert parse_stable_boost_release_tag("boost-1.90.0", min_v) == "boost-1.90.0" + assert parse_stable_boost_release_tag("boost-1.16.1", min_v) == "boost-1.16.1" + assert parse_stable_boost_release_tag("boost-1.16.0", min_v) is None + assert parse_stable_boost_release_tag("boost-1.90.0-beta", min_v) is None + assert parse_stable_boost_release_tag("", min_v) is None diff --git a/core/utils/boost_version_operations.py b/core/utils/boost_version_operations.py new file mode 100644 index 00000000..7383590c --- /dev/null +++ b/core/utils/boost_version_operations.py @@ -0,0 +1,197 @@ +""" +Boost release version helpers: macro packing, strict parse for Pinecone keys, +loose parse for sorting messy strings, normalization, and comparisons. + +**Strict (``BOOST_VERSION`` macro / Pinecone metadata keys)** — numeric packing:: + + major * 100_000 + minor * 100 + patch + +Requires ``minor <= 999`` and ``patch <= 99`` for collision-free encoding. +Use :func:`parse_boost_version_string` and :func:`encode_boost_version_string`. + +**Loose (sorting / analytics)** — digit runs per dot-separated segment; empty +input → ``(0, 0, 0)``. Handles strings like ``release-2.1.9-extra``. Use +:func:`loose_version_tuple` / :func:`compare_loose_version_strings`. + +**GitHub stable tags** — exact ``boost-X.Y.Z`` (no ``-beta`` / ``-rc`` suffix). +Use :func:`parse_stable_boost_release_tag` with a caller-supplied minimum tuple. +""" + +from __future__ import annotations + +import re + +# --- Macro packing (BOOST_VERSION / version.hpp) -------------------------------- + +MAJOR_MULTIPLIER = 100_000 +MINOR_MULTIPLIER = 100 + +_MAX_MINOR = 999 +_MAX_PATCH = 99 + +_VERSION_STRIP_PREFIX = re.compile(r"^boost[-_]", re.IGNORECASE) + + +def encode_boost_version(major: int, minor: int, patch: int) -> int: + """Return the packed integer (``major * 100_000 + minor * 100 + patch``).""" + if major < 0 or minor < 0 or patch < 0: + raise ValueError( + f"Version components must be non-negative, got {major}.{minor}.{patch}" + ) + if minor > _MAX_MINOR or patch > _MAX_PATCH: + raise ValueError( + f"Encoding requires minor <= {_MAX_MINOR} and patch <= {_MAX_PATCH} " + f"(got {major}.{minor}.{patch})" + ) + return major * MAJOR_MULTIPLIER + minor * MINOR_MULTIPLIER + patch + + +def decode_boost_version(encoded: int) -> tuple[int, int, int]: + """Split a packed ``BOOST_VERSION``-style integer into (major, minor, patch).""" + if encoded < 0: + raise ValueError(f"encoded version must be non-negative, got {encoded}") + major = encoded // MAJOR_MULTIPLIER + minor = (encoded // MINOR_MULTIPLIER) % 1000 + patch = encoded % MINOR_MULTIPLIER + return major, minor, patch + + +def parse_boost_version_string(version_str: str) -> tuple[int, int, int] | None: + """ + Parse ``1.86.0``, ``boost-1.86.0``, or ``1_86_0`` into (major, minor, patch). 
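+    For example, ``"boost-1.86"`` → ``(1, 86, 0)`` and ``"1_56_0"`` → ``(1, 56, 0)``.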
+ + Missing minor/patch segments default to 0. Returns None if unparseable or + out of encodable range. + """ + if not version_str or not str(version_str).strip(): + return None + s = _VERSION_STRIP_PREFIX.sub("", str(version_str).strip()) + s = s.replace("_", ".") + parts = s.split(".") + if not parts or not parts[0].strip(): + return None + try: + major = int(parts[0].strip()) + minor = int(parts[1].strip()) if len(parts) > 1 else 0 + patch = int(parts[2].strip()) if len(parts) > 2 else 0 + except ValueError: + return None + if minor > _MAX_MINOR or patch > _MAX_PATCH: + return None + if major < 0 or minor < 0 or patch < 0: + return None + return major, minor, patch + + +def encode_boost_version_string(version_str: str) -> int | None: + """Parse *version_str* and return the packed int, or None if invalid.""" + triple = parse_boost_version_string(version_str) + if triple is None: + return None + major, minor, patch = triple + try: + return encode_boost_version(major, minor, patch) + except ValueError: + return None + + +# --- Loose tuple (sorting / dirty strings) ------------------------------------ + + +def loose_version_tuple(version: str) -> tuple[int, int, int]: + """ + Parse *version* to (major, minor, patch) for sorting. + + Each segment uses the longest digit run only (e.g. ``1.82.x`` → ``(1, 82, 0)``). + Empty string → ``(0, 0, 0)``. + """ + if not version: + return (0, 0, 0) + parts = version.strip().split(".") + out: list[int] = [] + for part in parts[:3]: + number = "".join(c for c in part if c.isdigit()) + out.append(int(number) if number else 0) + while len(out) < 3: + out.append(0) + return tuple(out[:3]) + + +# --- Normalization ------------------------------------------------------------ + + +def normalize_boost_version_string(version_str: str) -> str | None: + """ + Normalize a version string for comparison; returns None if invalid or pre-1.0. + + Strips ``boost-`` prefix, maps ``-`` / ``_`` to ``.``, appends ``.0`` when + only two segments are present. + """ + version = (version_str or "").strip().replace("boost-", "") + version = version.replace("-", ".").replace("_", ".") + if not version or version.startswith("0."): + return None + if len(version.split(".")) == 2: + version = f"{version}.0" + return version + + +# --- Comparison --------------------------------------------------------------- + + +def compare_boost_version_tuples( + a: tuple[int, int, int], b: tuple[int, int, int] +) -> int: + """Return -1 if a < b, 0 if equal, 1 if a > b.""" + if a < b: + return -1 + if a > b: + return 1 + return 0 + + +def compare_loose_version_strings(left: str, right: str) -> int: + """Compare two version strings using :func:`loose_version_tuple`.""" + return compare_boost_version_tuples( + loose_version_tuple(left), loose_version_tuple(right) + ) + + +def compare_encoded_versions(i: int, j: int) -> int: + """ + Compare two packed ints from :func:`encode_boost_version`. + + Do not use for arbitrary integers that were not produced by that encoding. + """ + if i < j: + return -1 + if i > j: + return 1 + return 0 + + +# --- GitHub stable release tags (boostorg/boost) -------------------------------- + +BOOST_STABLE_RELEASE_TAG_PATTERN = re.compile(r"^boost-(\d+)\.(\d+)\.(\d+)$") + + +def parse_stable_boost_release_tag( + tag_name: str, + min_version: tuple[int, int, int], +) -> str | None: + """ + If *tag_name* matches ``boost-X.Y.Z`` (three numeric parts only) and the + version is >= *min_version*, return the canonical tag (e.g. ``boost-1.90.0``). 
+ + Returns ``None`` for empty names, non-matching patterns, or versions below + *min_version*. + """ + if not tag_name: + return None + m = BOOST_STABLE_RELEASE_TAG_PATTERN.match(tag_name.strip()) + if not m: + return None + major, minor, patch = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if compare_boost_version_tuples((major, minor, patch), min_version) == -1: + return None + return f"boost-{major}.{minor}.{patch}" diff --git a/core/utils/datetime_parsing.py b/core/utils/datetime_parsing.py index fa4710c4..f1bfb1eb 100644 --- a/core/utils/datetime_parsing.py +++ b/core/utils/datetime_parsing.py @@ -4,6 +4,21 @@ from datetime import datetime, timezone +from django.utils import timezone as django_timezone + + +def ensure_aware_utc(dt: datetime | None) -> datetime | None: + """ + Normalize a datetime for ``DateTimeField`` when ``USE_TZ`` is True. + + Naive values are treated as UTC. Aware values are converted to UTC. + """ + if dt is None: + return None + if django_timezone.is_naive(dt): + return django_timezone.make_aware(dt, django_timezone.utc) + return dt.astimezone(django_timezone.utc) + def parse_iso_datetime(raw: str | None) -> datetime | None: """ diff --git a/cppa_slack_tracker/utils/text_processing.py b/core/utils/text_processing.py similarity index 85% rename from cppa_slack_tracker/utils/text_processing.py rename to core/utils/text_processing.py index 1dd9f25e..e801d39c 100644 --- a/cppa_slack_tracker/utils/text_processing.py +++ b/core/utils/text_processing.py @@ -1,16 +1,16 @@ """ -Text processing utilities for Slack message preprocessing. +Shared text cleaning and light filtering helpers. -Adapted from workspace/utility.py for Django integration. -Contains functions for cleaning, filtering, and validating Slack message content. +Used by ``cppa_slack_tracker`` (and other apps) for normalizing message text and +optional greeting/noise phrase removal. Default word lists are Slack-oriented +(``SLACK_*`` constants). """ -import re -import logging -from typing import Optional, Iterable, FrozenSet - -logger = logging.getLogger(__name__) +from __future__ import annotations +import html +import re +from typing import Iterable, FrozenSet, Optional # Default greeting/unessential words for filter_sentence (Slack message cleaning) SLACK_GREETING_WORDS: FrozenSet[str] = frozenset( @@ -89,12 +89,13 @@ ) -def clean_text(text: str, remove_extra_spaces: bool = True) -> str: +def clean_text(text: str | None, remove_extra_spaces: bool = True) -> str: """ Clean and normalize text content. - Removes invisible characters, normalizes line breaks, and optionally - removes extra whitespace. + Removes invisible characters, decodes HTML character references (e.g. + ``&``, ``'``, ``/``), fixes a few common bare entities without + ``;``, normalizes line breaks, and optionally removes extra whitespace. 
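+    For example, ``"a &amp; b"`` is unescaped to ``"a & b"``.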
Args: text: Input text to clean @@ -118,18 +119,22 @@ def clean_text(text: str, remove_extra_spaces: bool = True) -> str: .replace("\u200b", "") .replace("\u200c", "") .replace("\u200d", "") + .replace("\xa0", " ") + .replace("\u2002", " ") + .replace("\u2003", " ") + .replace("\u2026", "...") + .replace("\u202f", " ") ) + text = html.unescape(text) + # Normalize line breaks text = re.sub(r"\r\n", "\n", text) # Windows line breaks text = re.sub(r"\r", "\n", text) # Old Mac line breaks if remove_extra_spaces: - # Remove multiple spaces text = re.sub(r" +", " ", text) - # Remove multiple newlines (keep max 2) text = re.sub(r"\n{3,}", "\n\n", text) - # Remove spaces at start/end of lines text = "\n".join(line.strip() for line in text.split("\n")) return text.strip() @@ -191,7 +196,7 @@ def filter_sentence( return sentence_lower.strip() -def validate_content_length(content: str, min_length: int = 50) -> bool: +def validate_content_length(content: str | None, min_length: int = 50) -> bool: """ Validate that content meets minimum length requirement. diff --git a/cppa_pinecone_sync/ingestion.py b/cppa_pinecone_sync/ingestion.py index d1ee0681..27b897d1 100644 --- a/cppa_pinecone_sync/ingestion.py +++ b/cppa_pinecone_sync/ingestion.py @@ -14,6 +14,7 @@ import hashlib import logging import re +from concurrent.futures import ThreadPoolExecutor, as_completed from enum import Enum from typing import Any, Optional @@ -73,16 +74,22 @@ def __init__(self, instance: PineconeInstance = PineconeInstance.PUBLIC) -> None self.sparse_model: str = getattr( settings, "PINECONE_SPARSE_MODEL", "pinecone-sparse-english-v0" ) + # Parallel metadata updates (update_documents); 1 = sequential. Cap with Pinecone rate limits. + self.update_max_workers: int = max( + 1, int(getattr(settings, "PINECONE_UPDATE_MAX_WORKERS", 8)) + ) self._setup_client() self._initialize_text_splitter() self._setup_indexes() logger.info( - "PineconeIngestion: dense_model=%s, sparse_model=%s, instance=%s", + "PineconeIngestion: dense_model=%s, sparse_model=%s, instance=%s, " + "update_max_workers=%d", self.dense_model, self.sparse_model, self.instance.value, + self.update_max_workers, ) @property @@ -375,7 +382,7 @@ def _prepare_batch_records( ) record: dict[str, Any] = {"id": doc_id, "chunk_text": text} record.update(metadata) - record.pop("table_ids", None) + record.pop("source_ids", None) records.append(record) return records @@ -406,7 +413,7 @@ def _mark_batch_failed( meta = doc.metadata or {} failed.append( { - "ids": meta.get("table_ids", ""), + "ids": meta.get("source_ids") or meta.get("table_ids", ""), "reason": f"Batch upsert failed: {error}", } ) @@ -479,24 +486,51 @@ def _update_all_batches( continue batch_failed_count = 0 - for update in batch_updates: - try: - self._update_single_record(update, namespace) - updated_count += 1 - except Exception as e: - error_msg = ( - f"Error updating metadata for batch {batch_num} " - f"record {update['id']}: {e}" - ) - logger.error(error_msg) - errors.append(error_msg) - failed_docs.append( - { - "ids": update.get("ids", ""), - "reason": f"Metadata update failed: {e}", - } - ) - batch_failed_count += 1 + if self.update_max_workers <= 1: + for update in batch_updates: + try: + self._update_single_record(update, namespace) + updated_count += 1 + except Exception as e: + error_msg = ( + f"Error updating metadata for batch {batch_num} " + f"record {update['id']}: {e}" + ) + logger.error(error_msg) + errors.append(error_msg) + failed_docs.append( + { + "ids": update.get("ids", ""), + "reason": 
f"Metadata update failed: {e}", + } + ) + batch_failed_count += 1 + else: + max_workers = min(self.update_max_workers, len(batch_updates)) + with ThreadPoolExecutor(max_workers=max_workers) as pool: + future_to_update = { + pool.submit(self._update_single_record, u, namespace): u + for u in batch_updates + } + for fut in as_completed(future_to_update): + update = future_to_update[fut] + try: + fut.result() + updated_count += 1 + except Exception as e: + error_msg = ( + f"Error updating metadata for batch {batch_num} " + f"record {update['id']}: {e}" + ) + logger.error(error_msg) + errors.append(error_msg) + failed_docs.append( + { + "ids": update.get("ids", ""), + "reason": f"Metadata update failed: {e}", + } + ) + batch_failed_count += 1 logger.info( "Updated metadata for batch %d: %d/%d documents", @@ -527,9 +561,9 @@ def _prepare_batch_updates( record_idx=len(updates), ) - source_ids = metadata.get("table_ids", "") - metadata.pop("table_ids", None) - updates.append({"id": doc_id, "set_metadata": metadata, "ids": source_ids}) + track_ids = metadata.get("source_ids") or metadata.get("table_ids", "") + metadata.pop("source_ids", None) + updates.append({"id": doc_id, "set_metadata": metadata, "ids": track_ids}) return updates diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 33e5f2e0..616e1291 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -1,54 +1,34 @@ """ - Main entry point for Pinecone sync. - - Other apps call ``sync_to_pinecone()`` to push their data into Pinecone. - This module orchestrates the full flow: - - 1. Collect failed IDs and last sync timestamp from the database. - 2. Call the caller-provided preprocessing function to get documents. - 3. Upsert documents to Pinecone via PineconeIngestion. - 4. Update the fail list and sync status in the database. - - -See docs/pinecone_sync.md for the full specification. - +See docs/Pinecone_preprocess_guideline.md (preprocess contract) and +docs/service_api/cppa_pinecone_sync.md (fail list / sync status services). """ from __future__ import annotations - import logging - from datetime import datetime - from typing import Any, Callable, Optional - from django.db import transaction - from . import services - from .ingestion import PineconeIngestion, PineconeInstance - logger = logging.getLogger(__name__) - # Module-level singletons keyed by instance; created on first use so that # Django settings are available and Pinecone libraries are imported only when # needed. - _ingestion_pool: dict[str, PineconeIngestion] = {} @@ -56,24 +36,17 @@ def _get_ingestion( instance: PineconeInstance = PineconeInstance.PUBLIC, ) -> PineconeIngestion: """Return (and lazily create) a PineconeIngestion for *instance*.""" - key = instance.value - if key not in _ingestion_pool: - _ingestion_pool[key] = PineconeIngestion(instance=instance) - return _ingestion_pool[key] # Type alias for the preprocessing function that callers must supply. 
- # Signature: - # - legacy: (failed_ids, final_sync_at) -> (raw_documents, is_chunked) - -# - metadata update: (failed_ids, final_sync_at) -> (raw_documents, is_chunked, metas_to_update) - +# - metadata update: (failed_ids, final_sync_at) -> +# (raw_documents, is_chunked, metas_to_update) PreprocessFn = Callable[ [list[str], Optional[datetime]], tuple[list[dict[str, Any]], bool] @@ -83,7 +56,6 @@ def _get_ingestion( def _empty_sync_result() -> dict[str, Any]: """Return the standard empty sync result dict.""" - return { "upserted": 0, "updated": 0, @@ -97,32 +69,32 @@ def _empty_sync_result() -> dict[str, Any]: } -def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: +def _build_documents_from_raw( + raw_documents: list[dict[str, Any]], +) -> list[Any]: """Convert preprocess output to langchain Documents; skip items missing doc_id/url.""" - from langchain_core.documents import Document documents: list[Any] = [] - for item in raw_documents: - content = item.get("content", "") - metadata = dict(item.get("metadata") or {}) - - ids_str = metadata.get("ids") or item.get("ids", "") or "" + ids_str = ( + metadata.get("source_ids") + or metadata.get("ids") + or item.get("source_ids", "") + or item.get("ids", "") + or "" + ) if "doc_id" not in metadata and "url" not in metadata: - logger.warning( - "Skipping document with ids=%s: metadata must contain 'doc_id' or 'url'", + "Skipping document with source_ids=%s: metadata must contain 'doc_id' or 'url'", ids_str, ) - continue metadata["table_ids"] = ids_str - documents.append(Document(page_content=content, metadata=metadata)) return documents @@ -130,53 +102,28 @@ def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: def _extract_new_failed_ids(result: dict[str, Any]) -> list[str]: """Collect source IDs from failed_documents in the upsert result.""" - new_failed_ids: list[str] = [] - for failed_doc in result.get("failed_documents", []): - ids_str = failed_doc.get("ids", "") - if ids_str: - new_failed_ids.extend( fid.strip() for fid in ids_str.split(",") if fid.strip() ) - return new_failed_ids def _extract_source_ids_from_documents(documents: list[Any]) -> list[str]: - """ - - Collect deduplicated source IDs from Document.metadata.table_ids in order. - - """ - - seen: set[str] = set() - + """Collect deduplicated source IDs from Document.metadata.table_ids in order.""" source_ids: list[str] = [] - for doc in documents: - table_ids = str(doc.metadata.get("table_ids", "")).strip() - if not table_ids: - continue - for token in table_ids.split(","): - source_id = token.strip() - - if not source_id or source_id in seen: - + if not source_id or source_id in source_ids: continue - - seen.add(source_id) - source_ids.append(source_id) - return source_ids @@ -188,42 +135,23 @@ def sync_to_pinecone( ) -> dict[str, Any]: """Run a full Pinecone sync cycle for *app_type*. - - This is the **public API** that other apps call. - - Args: - app_type: Identifies the data source (e.g. "slack", "mailing"). Stored as - - CharField in - - PineconeFailList and PineconeSyncStatus. - + CharField in PineconeFailList and PineconeSyncStatus. namespace: Pinecone namespace to upsert into. - - preprocess_fn: A callable returning ``(list[dict], is_chunked)``. Each dict - - must have ``content`` and ``metadata``; ``metadata`` must contain - - ``doc_id`` or ``url``. See docs/Pinecone_preprocess_guideline.md. - + preprocess_fn: A callable returning ``(list[dict], is_chunked)`` or + ``(list[dict], is_chunked, metas_to_update)``. 
Each dict must have + ``content`` and ``metadata``; ``metadata`` must contain ``doc_id`` + or ``url``. See docs/Pinecone_preprocess_guideline.md. instance: Which Pinecone API key to use (public or private). - Default is public. - - Returns: - dict with keys: upserted, updated, total, failed_count, failed_ids, - errors, update_errors. - """ - logger.info( "sync_to_pinecone: starting app_type=%s namespace=%s instance=%s", app_type, @@ -232,9 +160,7 @@ def sync_to_pinecone( ) failed_ids = services.get_failed_ids(app_type) - final_sync_at = services.get_final_sync_at(app_type) - logger.debug( "app_type=%s: %d previously failed IDs, final_sync_at=%s", app_type, @@ -245,86 +171,84 @@ def sync_to_pinecone( preprocess_result = preprocess_fn(failed_ids, final_sync_at) if len(preprocess_result) == 2: - raw_documents, is_chunked = preprocess_result - metas_to_update: list[dict[str, Any]] = [] - elif len(preprocess_result) == 3: - raw_documents, is_chunked, metas_to_update = preprocess_result - else: - raise ValueError( "preprocess_fn must return either " "(raw_documents, is_chunked) or " "(raw_documents, is_chunked, metas_to_update)" ) - if not raw_documents: - + if not raw_documents and not metas_to_update: logger.info( - "sync_to_pinecone: preprocess returned 0 documents for app_type=%s", + "sync_to_pinecone: preprocess returned 0 upsert docs and 0 metadata " + "updates for app_type=%s", app_type, ) - - services.update_sync_status(app_type) - return _empty_sync_result() - documents = _build_documents_from_raw(raw_documents) - - if not documents: - - services.update_sync_status(app_type) + upsert_documents = _build_documents_from_raw(raw_documents) if raw_documents else [] + meta_documents = ( + _build_documents_from_raw(metas_to_update) if metas_to_update else [] + ) + if not upsert_documents and not meta_documents: + logger.info( + "sync_to_pinecone: no valid documents after filtering for app_type=%s", + app_type, + ) return _empty_sync_result() - attempted_source_ids = _extract_source_ids_from_documents(documents) - ingestion = _get_ingestion(instance) + attempted_source_ids = _extract_source_ids_from_documents(upsert_documents) - result = ingestion.upsert_documents( - documents=documents, namespace=namespace, is_chunked=is_chunked - ) + if upsert_documents: + result = ingestion.upsert_documents( + documents=upsert_documents, + namespace=namespace, + is_chunked=is_chunked, + ) + else: + result = { + "upserted": 0, + "total": 0, + "errors": [], + "failed_documents": [], + } update_result: dict[str, Any] = {"updated": 0, "errors": []} - if metas_to_update: - - documents = _build_documents_from_raw(metas_to_update) - - if not documents: - - services.update_sync_status(app_type) - - return _empty_sync_result() - + if meta_documents: update_result = ingestion.update_documents( - documents=documents, namespace=namespace, is_chunked=is_chunked + documents=meta_documents, + namespace=namespace, + is_chunked=is_chunked, + ) + elif metas_to_update: + logger.warning( + "sync_to_pinecone: metas_to_update produced no valid documents " + "for app_type=%s (skipped metadata update)", + app_type, ) new_failed_ids = _extract_new_failed_ids(result) with transaction.atomic(): - services.clear_failed_ids(app_type) - if new_failed_ids: - services.record_failed_ids(app_type, new_failed_ids) - - logger.warning( - "app_type=%s: %d source IDs recorded as failed", - app_type, - len(new_failed_ids), - ) + logger.warning( + "app_type=%s: %d source IDs recorded as failed", + app_type, + len(new_failed_ids), + ) 
services.update_sync_status(app_type) failed_source_ids_set = set(new_failed_ids) - successful_source_ids = [ source_id for source_id in attempted_source_ids diff --git a/cppa_pinecone_sync/tests/test_sync.py b/cppa_pinecone_sync/tests/test_sync.py index bd223953..c17b293d 100644 --- a/cppa_pinecone_sync/tests/test_sync.py +++ b/cppa_pinecone_sync/tests/test_sync.py @@ -97,6 +97,33 @@ def test_build_documents_from_raw_mixed(): assert result[1].page_content == "c" +def test_build_documents_from_raw_metadata_source_ids(): + """metadata['source_ids'] is copied to table_ids (preferred over legacy top-level ids).""" + raw = [ + { + "content": "hello", + "metadata": {"doc_id": "doc-1", "source_ids": "42"}, + }, + ] + result = _build_documents_from_raw(raw) + assert len(result) == 1 + assert result[0].metadata.get("table_ids") == "42" + + +def test_build_documents_from_raw_source_ids_overrides_top_level_ids(): + """When both are present, metadata['source_ids'] wins for table_ids.""" + raw = [ + { + "ids": "legacy", + "content": "x", + "metadata": {"doc_id": "d", "source_ids": "from-meta"}, + }, + ] + result = _build_documents_from_raw(raw) + assert len(result) == 1 + assert result[0].metadata.get("table_ids") == "from-meta" + + # --- _extract_new_failed_ids --- @@ -160,8 +187,9 @@ def test_extract_new_failed_ids_skips_empty(): @pytest.mark.django_db -def test_sync_to_pinecone_empty_preprocess_returns_early(app_type): - """sync_to_pinecone returns empty result and updates status when preprocess returns no docs.""" +def test_sync_to_pinecone_empty_preprocess_returns_early(): + """No upsert/metadata work: empty result and PineconeSyncStatus is not touched.""" + app_type = "test_empty_preprocess_sync" def preprocess(_failed_ids, _final_sync_at): return [], False @@ -170,12 +198,12 @@ def preprocess(_failed_ids, _final_sync_at): assert result["upserted"] == 0 assert result["total"] == 0 assert result["failed_ids"] == [] - assert services.get_final_sync_at(app_type) is not None + assert services.get_final_sync_at(app_type) is None @pytest.mark.django_db def test_sync_to_pinecone_all_invalid_docs_returns_early(app_type): - """sync_to_pinecone returns empty result when all raw docs lack doc_id/url.""" + """sync_to_pinecone returns empty result and does not update sync status when all raw docs lack doc_id/url.""" def preprocess(_failed_ids, _final_sync_at): return [ @@ -185,6 +213,7 @@ def preprocess(_failed_ids, _final_sync_at): result = sync_to_pinecone(app_type, "ns", preprocess) assert result["upserted"] == 0 assert result["total"] == 0 + assert services.get_final_sync_at(app_type) is None @pytest.mark.django_db @@ -223,6 +252,47 @@ def preprocess(_failed_ids, _final_sync_at): assert services.get_final_sync_at(app_type) is not None +@pytest.mark.django_db +@patch("cppa_pinecone_sync.sync._get_ingestion") +def test_sync_to_pinecone_metadata_only_calls_update(mock_get_ingestion, app_type): + """Empty upsert batch but non-empty metas_to_update still runs update_documents.""" + mock_ingestion = MagicMock() + mock_ingestion.update_documents.return_value = { + "updated": 3, + "total": 3, + "errors": [], + "failed_documents": [], + } + mock_get_ingestion.return_value = mock_ingestion + + def preprocess(_failed_ids, _final_sync_at): + return ( + [], + False, + [ + { + "ids": "10", + "content": "metadata-only body " * 20, + "metadata": {"doc_id": "h1"}, + }, + ], + ) + + result = sync_to_pinecone(app_type, "meta_ns", preprocess) + + mock_ingestion.upsert_documents.assert_not_called() + 
mock_ingestion.update_documents.assert_called_once() + call_kw = mock_ingestion.update_documents.call_args[1] + assert call_kw["namespace"] == "meta_ns" + assert len(call_kw["documents"]) == 1 + assert result["upserted"] == 0 + assert result["total"] == 0 + assert result["failed_count"] == 0 + assert result["updated"] == 3 + assert result["failed_ids"] == [] + assert services.get_final_sync_at(app_type) is not None + + @pytest.mark.django_db @patch("cppa_pinecone_sync.sync._get_ingestion") def test_sync_to_pinecone_returns_metadata_update_result(mock_get_ingestion, app_type): diff --git a/cppa_slack_tracker/preprocessor.py b/cppa_slack_tracker/preprocessor.py index b2f0e05b..ce5df6c7 100644 --- a/cppa_slack_tracker/preprocessor.py +++ b/cppa_slack_tracker/preprocessor.py @@ -22,7 +22,7 @@ from django.db.models import Q from cppa_slack_tracker.models import SlackMessage -from cppa_slack_tracker.utils.text_processing import ( +from cppa_slack_tracker.utils import ( clean_text, filter_sentence, validate_content_length, @@ -382,8 +382,7 @@ def preprocess_slack_for_pinecone( "thread_ts": thread_ts if thread_ts else "", "group_size": len(message_ids), "team_id": team_id, - # ids should reference message timestamps for sync bookkeeping - "ids": ",".join(message_ids), + "source_ids": ",".join(message_ids), } docs.append({"content": content, "metadata": metadata}) diff --git a/cppa_slack_tracker/tests/test_preprocessor.py b/cppa_slack_tracker/tests/test_preprocessor.py index 5a20f6aa..c44dcad6 100644 --- a/cppa_slack_tracker/tests/test_preprocessor.py +++ b/cppa_slack_tracker/tests/test_preprocessor.py @@ -182,9 +182,9 @@ def test_preprocessor_document_shape_and_metadata_fields( assert "timestamp" in target["metadata"] assert "team_id" in target["metadata"] - # Check ids field for retry tracking - assert "ids" in target["metadata"] - assert isinstance(target["metadata"]["ids"], str) + # Check source_ids for retry tracking (Pinecone_preprocess_guideline.md) + assert "source_ids" in target["metadata"] + assert isinstance(target["metadata"]["source_ids"], str) @pytest.mark.django_db diff --git a/cppa_slack_tracker/tests/test_text_processing.py b/cppa_slack_tracker/tests/test_text_processing.py deleted file mode 100644 index d0cd44ff..00000000 --- a/cppa_slack_tracker/tests/test_text_processing.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Tests for cppa_slack_tracker.utils.text_processing.""" - -from cppa_slack_tracker.utils.text_processing import ( - clean_text, - filter_sentence, - validate_content_length, - SLACK_GREETING_WORDS, - SLACK_UNESSENTIAL_WORDS, -) - - -def test_clean_text_removes_invisible_characters(): - """clean_text removes soft hyphens and zero-width spaces.""" - text = "Hello\xadworld\u200b" - result = clean_text(text) - assert result == "Helloworld" - - -def test_clean_text_normalizes_line_breaks(): - """clean_text normalizes different line break styles.""" - text = "Line1\r\nLine2\rLine3\nLine4" - result = clean_text(text) - assert "\r" not in result - assert result.count("\n") == 3 - - -def test_clean_text_removes_extra_spaces(): - """clean_text removes multiple spaces when remove_extra_spaces=True.""" - text = "Hello world test" - result = clean_text(text, remove_extra_spaces=True) - assert result == "Hello world test" - - -def test_clean_text_limits_newlines(): - """clean_text limits consecutive newlines to max 2.""" - text = "Line1\n\n\n\n\nLine2" - result = clean_text(text, remove_extra_spaces=True) - assert result == "Line1\n\nLine2" - - -def 
test_clean_text_strips_line_whitespace(): - """clean_text removes spaces at start/end of lines.""" - text = " Line1 \n Line2 " - result = clean_text(text, remove_extra_spaces=True) - assert result == "Line1\nLine2" - - -def test_clean_text_handles_empty_input(): - """clean_text returns empty string for empty input.""" - assert clean_text("") == "" - assert clean_text(None) == "" - - -def test_filter_sentence_removes_greetings(): - """filter_sentence removes greeting words as whole phrases (keeps 'hi' inside 'this').""" - sentence = "Hi there, can you help me with this?" - result = filter_sentence(sentence) - assert result.startswith("there") # standalone "Hi" removed - assert "help" in result - assert "this" in result # "hi" inside "this" is not removed - - -def test_filter_sentence_removes_unessential_words(): - """filter_sentence removes unessential words like 'ok', 'lol'.""" - sentence = "Ok sure, that sounds great lol" - result = filter_sentence(sentence) - # After filtering, should have remaining meaningful content - assert isinstance(result, str) - - -def test_filter_sentence_returns_empty_for_short_result(): - """filter_sentence returns empty string if result is too short.""" - sentence = "Hi ok" # Only greeting and unessential words - result = filter_sentence(sentence, min_words_after=3) - assert result == "" - - -def test_filter_sentence_handles_empty_input(): - """filter_sentence returns empty string for empty input.""" - assert filter_sentence("") == "" - assert filter_sentence(" ") == "" - - -def test_filter_sentence_custom_word_lists(): - """filter_sentence accepts custom greeting and unessential word lists.""" - sentence = "Hello world test example" - result = filter_sentence( - sentence, - greeting_words=["hello"], - unessential_words=["world"], - min_words_after=1, - ) - assert "test" in result or "example" in result - assert "hello" not in result.lower() - assert "world" not in result.lower() - - -def test_validate_content_length_accepts_long_text(): - """validate_content_length returns True for text meeting minimum length.""" - long_text = "This is a much longer text that definitely exceeds the minimum length requirement" - assert validate_content_length(long_text, min_length=50) is True - - -def test_validate_content_length_rejects_short_text(): - """validate_content_length returns False for text below minimum length.""" - short_text = "Hi" - assert validate_content_length(short_text, min_length=50) is False - - -def test_validate_content_length_handles_empty_input(): - """validate_content_length returns False for empty input.""" - assert validate_content_length("") is False - assert validate_content_length(None) is False - - -def test_validate_content_length_strips_whitespace(): - """validate_content_length strips whitespace before checking length.""" - text_with_spaces = " Short " - assert validate_content_length(text_with_spaces, min_length=10) is False - - -def test_slack_greeting_words_constant(): - """SLACK_GREETING_WORDS contains expected greeting words.""" - assert "hi" in SLACK_GREETING_WORDS - assert "hello" in SLACK_GREETING_WORDS - assert "thanks" in SLACK_GREETING_WORDS - assert "goodbye" in SLACK_GREETING_WORDS - - -def test_slack_unessential_words_constant(): - """SLACK_UNESSENTIAL_WORDS contains expected unessential words.""" - assert "ok" in SLACK_UNESSENTIAL_WORDS - assert "lol" in SLACK_UNESSENTIAL_WORDS - assert "yeah" in SLACK_UNESSENTIAL_WORDS - assert "awesome" in SLACK_UNESSENTIAL_WORDS diff --git a/cppa_slack_tracker/utils/__init__.py 
b/cppa_slack_tracker/utils/__init__.py index fdec86f5..808ff04d 100644 --- a/cppa_slack_tracker/utils/__init__.py +++ b/cppa_slack_tracker/utils/__init__.py @@ -1,13 +1,16 @@ """ Utility functions for cppa_slack_tracker. + +Text processing lives in ``core.utils.text_processing``; re-exported here for +stable import paths (``from cppa_slack_tracker.utils import clean_text``, etc.). """ -from .text_processing import ( +from core.utils.text_processing import ( + SLACK_GREETING_WORDS, + SLACK_UNESSENTIAL_WORDS, clean_text, filter_sentence, validate_content_length, - SLACK_GREETING_WORDS, - SLACK_UNESSENTIAL_WORDS, ) __all__ = [ diff --git a/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py new file mode 100644 index 00000000..5623629c --- /dev/null +++ b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py @@ -0,0 +1,19 @@ +# Generated by Django 4.2.28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="wg21paperauthorprofile", + name="author_alias", + field=models.CharField(blank=True, db_index=True, default="", max_length=255), + preserve_default=False, + ), + ] diff --git a/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py new file mode 100644 index 00000000..7d2018e5 --- /dev/null +++ b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py @@ -0,0 +1,52 @@ +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="YoutubeSpeaker", + fields=[ + ( + "baseprofile_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="cppa_user_tracker.baseprofile", + ), + ), + ( + "external_id", + models.CharField(blank=True, max_length=255, null=True), + ), + ("display_name", models.CharField(db_index=True, max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + bases=("cppa_user_tracker.baseprofile",), + ), + migrations.AlterField( + model_name="baseprofile", + name="type", + field=models.CharField( + choices=[ + ("github", "GitHub"), + ("slack", "Slack"), + ("mailing_list", "Mailing list"), + ("wg21", "WG21"), + ("discord", "Discord"), + ("youtube", "YouTube"), + ], + db_index=True, + max_length=20, + ), + ), + ] diff --git a/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py new file mode 100644 index 00000000..721c9a73 --- /dev/null +++ b/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.28 on 2026-03-11 01:57 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_youtubespeaker_alter_baseprofile_type"), + ] + + operations = [ + migrations.AlterField( + model_name="slackuser", + name="slack_user_id", + field=models.CharField(max_length=64, unique=True), + ), + ] 
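Note on the `cppa_slack_tracker/utils/__init__.py` change above: the re-export is meant to keep existing import paths working while the implementation moves to `core.utils.text_processing`. A minimal sanity check (hypothetical snippet, not part of this diff; assumes both modules import cleanly) would confirm the old path hands back the very same callables rather than copies:

```python
# Hypothetical check that the re-export is a true alias, not a copy:
# call sites importing from cppa_slack_tracker.utils should get the exact
# same objects that now live in core.utils.text_processing.
from core.utils import text_processing as core_tp
from cppa_slack_tracker import utils as slack_utils

assert slack_utils.clean_text is core_tp.clean_text
assert slack_utils.filter_sentence is core_tp.filter_sentence
assert slack_utils.validate_content_length is core_tp.validate_content_length
assert slack_utils.SLACK_GREETING_WORDS is core_tp.SLACK_GREETING_WORDS
```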
diff --git a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py new file mode 100644 index 00000000..199289eb --- /dev/null +++ b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py @@ -0,0 +1,67 @@ +import re + +from django.db import migrations, models + + +def _slugify_speaker_name(name: str) -> str: + """Match cppa_youtube_script_tracker.utils._slugify_speaker_name (no channel/video).""" + s = (name or "").strip().lower() + s = re.sub(r"[^a-z0-9]+", "_", s).strip("_") + return s or "unknown" + + +def populate_external_id(apps, schema_editor): + """Seed external_id using same format as build_speaker_external_id(..., "", "").""" + YoutubeSpeaker = apps.get_model("cppa_user_tracker", "YoutubeSpeaker") + + used = set( + YoutubeSpeaker.objects.exclude(external_id__isnull=True) + .exclude(external_id="") + .values_list("external_id", flat=True) + ) + + for speaker in YoutubeSpeaker.objects.all().order_by("baseprofile_ptr_id"): + if speaker.external_id: + continue + slug = _slugify_speaker_name(speaker.display_name) + candidate = f"youtube:name:{slug}" + if candidate in used: + candidate = f"{candidate}:{speaker.baseprofile_ptr_id}" + speaker.external_id = candidate + speaker.save(update_fields=["external_id"]) + used.add(candidate) + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0006_alter_slackuser_slack_user_id"), + ] + + operations = [ + # Defensive cleanup for previously failed local runs. + migrations.RunSQL( + sql=( + "DROP INDEX IF EXISTS " + "cppa_user_tracker_youtubespeaker_external_id_8b44bffb_like;" + "DROP INDEX IF EXISTS " + "cppa_user_tracker_youtubespeaker_external_id_8b44bffb;" + ), + reverse_sql=migrations.RunSQL.noop, + ), + # Add column if missing (no-op when 0005 already created it; required when + # upgrading from pre-fix 0005 that did not include external_id). + migrations.RunSQL( + sql=( + "ALTER TABLE cppa_user_tracker_youtubespeaker " + "ADD COLUMN IF NOT EXISTS external_id VARCHAR(255) NULL;" + ), + reverse_sql=migrations.RunSQL.noop, + ), + migrations.RunPython(populate_external_id, migrations.RunPython.noop), + migrations.AlterField( + model_name="youtubespeaker", + name="external_id", + field=models.CharField(max_length=255, unique=True), + ), + ] diff --git a/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py new file mode 100644 index 00000000..fcdc4f2b --- /dev/null +++ b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py @@ -0,0 +1,13 @@ +# Merge parallel branches from 0004: WG21 author_alias vs YouTube speaker chain. 
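+# Both branches descend from 0004 and touch unrelated models (author_alias on
+# WG21PaperAuthorProfile vs the YoutubeSpeaker/external_id chain), so no operations
+# are needed; this merge only records a common ancestor for later migrations.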
+ +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), + ("cppa_user_tracker", "0007_youtubespeaker_external_id"), + ] + + operations = [] diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 46be6272..7357c017 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -11,6 +11,7 @@ class ProfileType(models.TextChoices): MAILING_LIST = "mailing_list", "Mailing list" WG21 = "wg21", "WG21" DISCORD = "discord", "Discord" + YOUTUBE = "youtube", "YouTube" class GitHubAccountType(models.TextChoices): @@ -165,6 +166,7 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) display_name = models.CharField(max_length=255, db_index=True, blank=True) + author_alias = models.CharField(max_length=255, blank=True, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) @@ -183,3 +185,20 @@ def save(self, *args, **kwargs): is_bot = models.BooleanField(default=False) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) + + +class YoutubeSpeaker(BaseProfile): + """YouTube speaker profile. + + Uses external_id as canonical identifier (stable across updates). display_name is + a human-readable field and is not used as the identity key. + """ + + def save(self, *args, **kwargs): + self.type = ProfileType.YOUTUBE + super().save(*args, **kwargs) + + external_id = models.CharField(max_length=255, unique=True) + display_name = models.CharField(max_length=255, db_index=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index a5838940..f2853230 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -26,6 +26,8 @@ MailingListProfile, SlackUser, DiscordProfile, + WG21PaperAuthorProfile, + YoutubeSpeaker, ) @@ -247,7 +249,9 @@ def _get_next_negative_github_account_id() -> int: @transaction.atomic -def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool]: +def get_or_create_slack_user( + user_data: dict[str, Any], +) -> tuple[SlackUser, bool]: """Get or create a SlackUser from Slack API user data. Returns (SlackUser, created). If the user exists, updates username, display_name, and avatar_url from user_data. @@ -350,3 +354,61 @@ def get_or_create_discord_profile( profile.is_bot = is_bot profile.save() return profile, created + + +def get_or_create_wg21_paper_author_profile( + display_name: str, + email: Optional[str] = None, +) -> tuple[WG21PaperAuthorProfile, bool]: + """Get or create a WG21PaperAuthorProfile by display_name, with optional email disambiguation. + + Finds all profiles with the given display_name. If none exist, creates one and adds + email if provided. If one exists, returns it. If multiple exist, and email is + provided, returns the one with that email if any; otherwise returns the first. + """ + display_name_val = (display_name or "").strip() + email_val = (email or "").strip() or None + + candidates = list( + WG21PaperAuthorProfile.objects.filter(display_name=display_name_val).order_by( + "id" + ) + ) + + # Disambiguate by email if provided. 
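+    # A candidate is reused only when the given email matches one of its stored emails,
+    # or when no email was given and the candidate has no emails at all; anything else
+    # falls through to create a fresh profile (with the email, when provided) below.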
+ for p in candidates: + if email_val and p.emails.filter(email=email_val).exists(): + return p, False + elif not email_val and not p.emails.exists(): + return p, False + + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) + if email_val: + add_email(profile, email_val, is_primary=True) + return profile, True + + +def get_or_create_youtube_speaker( + external_id: str, + display_name: str = "", + identity: Optional[Identity] = None, +) -> tuple[YoutubeSpeaker, bool]: + """Get or create a YoutubeSpeaker by external_id. Returns (speaker, created). + + Looks up by external_id. On creation, sets identity/display_name if provided. + If the record already exists and a non-empty display_name is provided, updates + display_name when changed. + Raises ValueError if external_id is empty. + """ + external_id_val = (external_id or "").strip() + display_name_val = (display_name or "").strip() + if not external_id_val: + raise ValueError("external_id must not be empty.") + speaker, created = YoutubeSpeaker.objects.get_or_create( + external_id=external_id_val, + defaults={"display_name": display_name_val, "identity": identity}, + ) + if not created and display_name_val and speaker.display_name != display_name_val: + speaker.display_name = display_name_val + speaker.save(update_fields=["display_name", "updated_at"]) + return speaker, created diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index cf614811..0c09e0ce 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -8,6 +8,7 @@ GitHubAccountType, Identity, TempProfileIdentityRelation, + WG21PaperAuthorProfile, ) from cppa_user_tracker import services @@ -569,3 +570,103 @@ def test_get_or_create_mailing_list_profile_strips_display_name_and_email(): assert created is True assert profile.display_name == "Trimmed" assert profile.emails.filter(email="trimmed@example.com").exists() + + +# --- get_or_create_wg21_paper_author_profile --- + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_creates(): + """get_or_create_wg21_paper_author_profile creates new profile when none exist.""" + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="New Author" + ) + assert created is True + assert profile.display_name == "New Author" + assert WG21PaperAuthorProfile.objects.filter(display_name="New Author").count() == 1 + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_with_email_adds_email(): + """get_or_create_wg21_paper_author_profile adds email to new profile when provided.""" + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Author With Email", + email="author@example.com", + ) + assert created is True + assert profile.emails.filter(email="author@example.com").exists() + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): + """get_or_create_wg21_paper_author_profile returns existing profile when exactly one matches.""" + existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Solo Author" + ) + assert created is False + assert profile.id == existing.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_creates_new_profile(): + """One name match but email not on that profile: creates a 
new profile with the email. + + Disambiguation only returns an existing row when the email matches or when no email + is passed and the candidate has no emails; otherwise a new profile is created. + """ + existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Solo Author", + email="solo@example.com", + ) + assert created is True + assert profile.id != existing.id + assert profile.display_name == "Solo Author" + assert profile.emails.filter(email="solo@example.com").exists() + assert ( + WG21PaperAuthorProfile.objects.filter(display_name="Solo Author").count() == 2 + ) + assert not existing.emails.filter(email="solo@example.com").exists() + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first(): + """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + _second = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Dup Name" + ) + assert created is False + assert profile.id == first.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_second(): + """get_or_create_wg21_paper_author_profile returns profile with matching email when multiple match.""" + _first = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + second = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + services.add_email(second, "match@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Same Name", + email="match@example.com", + ) + assert created is False + assert profile.id == second.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_creates_new_profile(): + """When multiple match and email matches none, a new profile is created with that email.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + second = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + services.add_email(second, "other@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Other Name", + email="nomatch@example.com", + ) + assert created is True + assert profile.id not in (first.id, second.id) + assert profile.display_name == "Other Name" + assert profile.emails.filter(email="nomatch@example.com").exists() diff --git a/cppa_youtube_script_tracker/__init__.py b/cppa_youtube_script_tracker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cppa_youtube_script_tracker/admin.py b/cppa_youtube_script_tracker/admin.py new file mode 100644 index 00000000..67966781 --- /dev/null +++ b/cppa_youtube_script_tracker/admin.py @@ -0,0 +1,56 @@ +from django.contrib import admin +from django.contrib.admin import ModelAdmin + +from .models import ( + CppaTags, + YouTubeChannel, + YouTubeVideo, + YouTubeVideoSpeaker, + YouTubeVideoTags, +) + + +@admin.register(YouTubeChannel) +class YouTubeChannelAdmin(ModelAdmin): + list_display = ("channel_id", "channel_title", "created_at") + search_fields = ("channel_id", "channel_title") + + +@admin.register(YouTubeVideo) +class YouTubeVideoAdmin(ModelAdmin): + list_display = ( + "video_id", + "title", + "channel", + 
"published_at", + "has_transcript", + "created_at", + ) + list_filter = ("has_transcript", "channel", "published_at") + search_fields = ("video_id", "title", "description", "search_term") + raw_id_fields = ("channel",) + date_hierarchy = "published_at" + + +@admin.register(YouTubeVideoSpeaker) +class YouTubeVideoSpeakerAdmin(ModelAdmin): + list_display = ("id", "video", "speaker", "created_at") + raw_id_fields = ("video", "speaker") + search_fields = ("video__video_id", "video__title", "speaker__display_name") + + +@admin.register(CppaTags) +class CppaTagsAdmin(ModelAdmin): + list_display = ("id", "tag_name") + search_fields = ("tag_name",) + + +@admin.register(YouTubeVideoTags) +class YouTubeVideoTagsAdmin(ModelAdmin): + list_display = ("id", "youtube_video", "cppa_tag") + raw_id_fields = ("youtube_video", "cppa_tag") + search_fields = ( + "youtube_video__video_id", + "youtube_video__title", + "cppa_tag__tag_name", + ) diff --git a/cppa_youtube_script_tracker/apps.py b/cppa_youtube_script_tracker/apps.py new file mode 100644 index 00000000..6565ddac --- /dev/null +++ b/cppa_youtube_script_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class CppaYoutubeScriptTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "cppa_youtube_script_tracker" + verbose_name = "CPPA YouTube Script Tracker" diff --git a/cppa_youtube_script_tracker/fetcher.py b/cppa_youtube_script_tracker/fetcher.py new file mode 100644 index 00000000..e7872a78 --- /dev/null +++ b/cppa_youtube_script_tracker/fetcher.py @@ -0,0 +1,365 @@ +""" +YouTube Data API v3 fetcher for cppa_youtube_script_tracker. + +Adapted from cppa-brain-backend/copilot_data/scrape/youtube_cpp/scraper.py. +Fetches video metadata for C++ channels between published_after and published_before. +""" + +from __future__ import annotations + +import logging +import re +import time +from datetime import datetime, timezone +from typing import Any, Optional + +from django.conf import settings + +logger = logging.getLogger(__name__) + +# Maps channel title to stable YouTube channel ID. +C_PLUS_PLUS_CHANNELS: dict[str, str] = { + "CppCon": "UCMlGfpWw-RUdWX_JbLCukXg", + "Meeting C++": "UCX9pk4YzHFcl3MsHIYBlEKg", + "C++Now": "UCEfngwe09zvd9KAL33YJSQQ", + "Jason Turner": "UCXTpTQHR7li1_HkUyAIUjkQ", + "TheCherno": "UCQ-W1KE9EYfdxhL6S4twUNw", + "Bo Qian": "UCEqgmyWChwmqyRdmnsS24Zw", +} + +_CHANNEL_FOCUSED_TERMS: list[str] = [ + "C++", +] + +# Search-term based discovery (global searches, not tied to one channel ID) +_GLOBAL_SEARCH_TERMS: list[str] = [ + "C++ programming", + "C++ tutorial", + "C++ advanced", + "modern C++", + "C++20", + "C++23", + "C++ templates", + "C++ STL", + "C++ best practices", + "C++ performance", + "Boost C++", +] + +# Famous-figure focused discovery terms. +_FAMOUS_FIGURE_TERMS: list[str] = [ + "Bjarne Stroustrup C++", + "Herb Sutter C++", + "Scott Meyers C++", + "Andrei Alexandrescu C++", + "Nicolai Josuttis C++", + "Chandler Carruth C++", + "Kate Gregory C++", + "Jason Turner C++", + "Sean Parent C++", + "Jonathan Boccara C++", +] + +_MAX_RESULTS_PER_PAGE = 50 +_DELAY_SECONDS = 0.5 +_DEFAULT_MAX_QUERY_PAIRS = 30 + + +class QuotaExceededError(RuntimeError): + """Raised when YouTube Data API quota has been exhausted.""" + + +def _get_api_key() -> str: + """Return YOUTUBE_API_KEY from Django settings. Raises ValueError if missing.""" + key = (getattr(settings, "YOUTUBE_API_KEY", None) or "").strip() + if not key: + raise ValueError( + "YOUTUBE_API_KEY is not set. 
Add it to your .env or Django settings." + ) + return key + + +def _parse_duration_iso(duration_iso: str) -> int: + """Parse ISO 8601 duration string (e.g. PT1H2M10S) to total seconds.""" + if not duration_iso or duration_iso == "PT": + return 0 + match = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?").match(duration_iso) + if not match: + return 0 + return ( + int(match.group(1) or 0) * 3600 + + int(match.group(2) or 0) * 60 + + int(match.group(3) or 0) + ) + + +def _is_quota_exceeded_error(exc: Exception) -> bool: + text = str(exc).lower() + return "quotaexceeded" in text or "youtube.quota" in text + + +def _get_max_query_pairs() -> int: + """ + Return max number of query pairs for one run. + + Configure with `YOUTUBE_MAX_QUERY_PAIRS` in Django settings/.env. + """ + raw = getattr(settings, "YOUTUBE_MAX_QUERY_PAIRS", _DEFAULT_MAX_QUERY_PAIRS) + try: + value = int(raw) + except (TypeError, ValueError): + value = _DEFAULT_MAX_QUERY_PAIRS + return max(1, value) + + +def _format_video_data( + video_data: dict[str, Any], search_term: str = "" +) -> dict[str, Any]: + """Normalise a YouTube API video resource into a flat metadata dict.""" + snippet = video_data.get("snippet", {}) + statistics = video_data.get("statistics", {}) + content_details = video_data.get("contentDetails", {}) + duration_iso = content_details.get("duration", "PT0S") + view = statistics.get("viewCount") + like = statistics.get("likeCount") + comment = statistics.get("commentCount") + return { + "video_id": video_data.get("id", ""), + "title": snippet.get("title", ""), + "description": snippet.get("description", ""), + "channel_id": snippet.get("channelId", ""), + "channel_title": snippet.get("channelTitle", ""), + "published_at": snippet.get("publishedAt", ""), + "duration_seconds": _parse_duration_iso(duration_iso), + "view_count": int(view) if view is not None else None, + "like_count": int(like) if like is not None else None, + "comment_count": int(comment) if comment is not None else None, + "tags": snippet.get("tags") or [], + "search_term": search_term, + "scraped_at": datetime.now(tz=timezone.utc).isoformat(), + } + + +def _to_rfc3339(dt: datetime) -> str: + """Format a datetime as RFC 3339 (required by YouTube API publishedAfter/Before).""" + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) + return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _build_queries(channel_title: Optional[str]) -> list[tuple[str, Optional[str]]]: + """Return list of (query_text, channel_id_or_None) pairs to iterate over. + + Strategy: + - If channel_title is specified: + - Known channel ID: run several C++ terms scoped to that channel. + - Unknown channel: run keyword searches with that channel title. + - Otherwise: + - Run channel-scoped queries for known channels. + - Run global term-based discovery queries. + - Run famous-figure discovery queries. 
+ """ + + def _dedupe_pairs( + pairs: list[tuple[str, Optional[str]]], + ) -> list[tuple[str, Optional[str]]]: + seen: set[tuple[str, Optional[str]]] = set() + out: list[tuple[str, Optional[str]]] = [] + for query_text, ch_id in pairs: + key = (query_text.strip().casefold(), ch_id) + if key in seen: + continue + seen.add(key) + out.append((query_text, ch_id)) + return out + + if channel_title: + ch_id = C_PLUS_PLUS_CHANNELS.get(channel_title) + if not ch_id: + logger.warning( + "fetch_videos: channel_title %r not in C_PLUS_PLUS_CHANNELS; " + "falling back to keyword search", + channel_title, + ) + return _dedupe_pairs( + [(channel_title, None), (f"{channel_title} C++", None)] + ) + return _dedupe_pairs([(term, ch_id) for term in _CHANNEL_FOCUSED_TERMS]) + + pairs: list[tuple[str, Optional[str]]] = [] + for ch_id in C_PLUS_PLUS_CHANNELS.values(): + pairs.extend((term, ch_id) for term in _CHANNEL_FOCUSED_TERMS) + + pairs.extend((term, None) for term in _FAMOUS_FIGURE_TERMS) + pairs.extend((term, None) for term in _GLOBAL_SEARCH_TERMS) + return _dedupe_pairs(pairs) + + +def _fetch_search_page( + youtube: Any, + query_text: str, + ch_id: Optional[str], + after_str: str, + before_str: str, + page_token: Optional[str], +) -> Optional[dict[str, Any]]: + """Execute one search().list() call; return the response or None on error. + + Raises QuotaExceededError when API quota is exhausted. + """ + params: dict[str, Any] = { + "q": query_text, + "part": "id,snippet", + "type": "video", + "maxResults": _MAX_RESULTS_PER_PAGE, + "order": "date", + "publishedAfter": after_str, + "publishedBefore": before_str, + } + if ch_id: + params["channelId"] = ch_id + if page_token: + params["pageToken"] = page_token + try: + time.sleep(_DELAY_SECONDS) + return youtube.search().list(**params).execute() # type: ignore[union-attr] + except Exception as exc: # pylint: disable=broad-exception-caught + if _is_quota_exceeded_error(exc): + raise QuotaExceededError("YouTube API quota exceeded.") from exc + logger.error("fetch_videos: search API error: %s", exc) + return None + + +def _fetch_video_details(youtube: Any, video_ids: list[str]) -> list[dict[str, Any]]: + """Execute one videos().list() call; return items or empty list on error. + + Raises QuotaExceededError when API quota is exhausted. + """ + try: + time.sleep(_DELAY_SECONDS) + resp = ( + youtube.videos() # type: ignore[union-attr] + .list(part="snippet,statistics,contentDetails", id=",".join(video_ids)) + .execute() + ) + return resp.get("items", []) + except Exception as exc: # pylint: disable=broad-exception-caught + if _is_quota_exceeded_error(exc): + raise QuotaExceededError("YouTube API quota exceeded.") from exc + logger.error("fetch_videos: videos.list API error: %s", exc) + return [] + + +def _process_one_channel_query( + youtube: Any, + query_text: str, + ch_id: Optional[str], + after_str: str, + before_str: str, + seen_ids: set[str], + min_duration_seconds: int, +) -> list[dict[str, Any]]: + """Paginate through search results for one (query, channel) pair. 
Returns new video dicts.""" + collected: list[dict[str, Any]] = [] + page_token: Optional[str] = None + while True: + response = _fetch_search_page( + youtube, query_text, ch_id, after_str, before_str, page_token + ) + if response is None: + break + + new_ids = [ + item["id"]["videoId"] + for item in response.get("items", []) + if item.get("id", {}).get("kind") == "youtube#video" + and item["id"]["videoId"] not in seen_ids + ] + + for vdata in _fetch_video_details(youtube, new_ids) if new_ids else []: + vid = vdata.get("id", "") + if not vid or vid in seen_ids: + continue + duration = _parse_duration_iso( + vdata.get("contentDetails", {}).get("duration", "PT0S") + ) + if min_duration_seconds and duration < min_duration_seconds: + continue + seen_ids.add(vid) + collected.append(_format_video_data(vdata, search_term=query_text)) + + page_token = response.get("nextPageToken") + if not page_token: + break + return collected + + +def fetch_videos( + published_after: datetime, + published_before: datetime, + channel_title: Optional[str] = None, + skip_video_ids: Optional[set[str]] = None, + min_duration_seconds: int = 0, +) -> list[dict[str, Any]]: + """Fetch video metadata from the YouTube Data API v3. + + Args: + published_after: Fetch videos published after this time. + published_before: Fetch videos published before this time. + channel_title: If given, restrict to that channel (key in C_PLUS_PLUS_CHANNELS + or fallback keyword search). + skip_video_ids: Video IDs already in DB (skipped). + min_duration_seconds: Skip videos shorter than this. + + Returns: + List of normalised video metadata dicts. + """ + try: + from googleapiclient.discovery import build + except ImportError as exc: + raise ImportError( + "google-api-python-client is required: pip install google-api-python-client" + ) from exc + + youtube = build("youtube", "v3", developerKey=_get_api_key()) + after_str = _to_rfc3339(published_after) + before_str = _to_rfc3339(published_before) + seen_ids: set[str] = set(skip_video_ids or set()) + all_videos: list[dict[str, Any]] = [] + query_pairs = _build_queries(channel_title) + max_queries = _get_max_query_pairs() + if len(query_pairs) > max_queries: + logger.warning( + "fetch_videos: query list truncated from %d to %d by YOUTUBE_MAX_QUERY_PAIRS", + len(query_pairs), + max_queries, + ) + query_pairs = query_pairs[:max_queries] + + for idx, (query_text, ch_id) in enumerate(query_pairs, start=1): + try: + all_videos.extend( + _process_one_channel_query( + youtube, + query_text, + ch_id, + after_str, + before_str, + seen_ids, + min_duration_seconds, + ) + ) + except QuotaExceededError: + logger.error( + "fetch_videos: quota exhausted at query %d/%d (%r). 
" + "Returning partial results collected so far.", + idx, + len(query_pairs), + query_text, + ) + break + + logger.info("fetch_videos: fetched %d videos", len(all_videos)) + return all_videos diff --git a/cppa_youtube_script_tracker/management/__init__.py b/cppa_youtube_script_tracker/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cppa_youtube_script_tracker/management/commands/__init__.py b/cppa_youtube_script_tracker/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py new file mode 100644 index 00000000..7e3cf4d3 --- /dev/null +++ b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py @@ -0,0 +1,546 @@ +""" +Management command: run_cppa_youtube_script_tracker + +4-phase pipeline: + Phase 1: Process existing metadata queue JSONs → persist to DB → + move JSON to raw/metadata/ (permanent archive). + Phase 2: Determine start_time, fetch video metadata from YouTube Data API v3, + write to metadata queue (short-lived), persist to DB, + move JSON to raw/metadata/ (permanent archive). + Phase 3: Download VTT transcripts via yt-dlp for videos with has_transcript=False; + save directly to raw/transcripts/ (never deleted). + Phase 4: Pinecone upsert via run_cppa_pinecone_sync. +""" + +from __future__ import annotations + +import json +import logging +import os +import shutil +from datetime import datetime, timezone +from typing import Optional + +from django.conf import settings +from django.core.exceptions import ValidationError +from django.core.management import call_command +from django.core.management.base import BaseCommand +from django.utils.dateparse import parse_datetime + +from cppa_user_tracker.services import get_or_create_youtube_speaker +from cppa_youtube_script_tracker.fetcher import fetch_videos +from cppa_youtube_script_tracker.models import YouTubeVideo +from cppa_youtube_script_tracker.preprocessor import preprocess_youtube_for_pinecone +from cppa_youtube_script_tracker.services import ( + get_or_create_channel, + get_or_create_tag, + get_or_create_video, + link_speaker_to_video, + link_tag_to_video, + remove_speaker_links_by_name, + update_video_transcript, +) +from cppa_youtube_script_tracker.transcript import download_vtt +from cppa_youtube_script_tracker.utils import ( + UNKNOWN_SPEAKER_NAME, + build_speaker_external_id, + clean_text, + resolve_speakers, +) +from cppa_youtube_script_tracker.workspace import ( + get_metadata_queue_path, + get_raw_metadata_path, + get_raw_transcripts_dir, + iter_metadata_queue_jsons, +) + +logger = logging.getLogger(__name__) + +PINECONE_NAMESPACE_ENV_KEY = "YOUTUBE_PINECONE_NAMESPACE" +_DEFAULT_PINECONE_NAMESPACE = "youtube-scripts" + +YOUTUBE_COOKIES_FILE = os.getenv("YOUTUBE_COOKIES_FILE", "youtube_cookies.txt") + + +def _move_to_raw(video_id: str, queue_path) -> None: + """Move a metadata JSON from queue to raw/metadata/ (permanent archive).""" + try: + raw_path = get_raw_metadata_path(video_id) + raw_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(queue_path), str(raw_path)) + except Exception: + logger.warning( + "_move_to_raw: could not move %s to raw/metadata/, leaving in queue", + queue_path, + ) + return + + +def _persist_video(video_data: dict) -> tuple[bool, bool]: + """Persist one video metadata dict to DB. 
Returns (created, skipped).""" + video_id = clean_text(video_data.get("video_id", "")) + if not video_id: + return False, True + + channel_id = clean_text(video_data.get("channel_id", "")) + channel_title = clean_text(video_data.get("channel_title", "")) + channel = get_or_create_channel(channel_id, channel_title) if channel_id else None + + metadata = _build_video_metadata(video_data) + + try: + video, created = get_or_create_video( + video_id=video_id, channel=channel, metadata_dict=metadata + ) + except (ValueError, ValidationError) as e: + logger.warning( + "_persist_video: validation error for video_id=%s: %s", video_id, e + ) + return False, True + + speaker_names = _resolve_video_speakers(video_data, channel_title) + _link_speakers(video, speaker_names, channel_id=channel_id, video_id=video_id) + _link_tags(video, video_data.get("tags") or [], video_id=video_id) + + return created, False + + +def _build_video_metadata(video_data: dict) -> dict: + return { + "title": clean_text(video_data.get("title", "")), + "description": clean_text(video_data.get("description", "")), + "published_at": video_data.get("published_at"), + "duration_seconds": video_data.get("duration_seconds", 0), + "view_count": video_data.get("view_count"), + "like_count": video_data.get("like_count"), + "comment_count": video_data.get("comment_count"), + "search_term": clean_text(video_data.get("search_term", "")), + "scraped_at": video_data.get("scraped_at"), + } + + +def _link_speakers( + video: YouTubeVideo, + speaker_names: list[str], + *, + channel_id: str, + video_id: str, +) -> None: + for name in speaker_names: + speaker, _ = get_or_create_youtube_speaker( + external_id=build_speaker_external_id( + speaker_name=name, + channel_id=channel_id, + video_id=video_id, + ), + display_name=name, + ) + link_speaker_to_video(video, speaker) + + +def _link_tags(video: YouTubeVideo, raw_tags: list[str], *, video_id: str) -> None: + for raw_tag in raw_tags: + tag_name = clean_text(raw_tag) + if not tag_name: + continue + tag = get_or_create_tag(tag_name) + link_tag_to_video(video, tag) + + +def _resolve_video_speakers(video_data: dict, channel_title: str) -> list[str]: + return resolve_speakers( + title=clean_text(video_data.get("title", "")), + description=clean_text(video_data.get("description", "")), + channel_title=channel_title, + ) + + +def _process_queue() -> tuple[int, int]: + """Phase 1: load each metadata queue JSON, persist to DB, move to raw/metadata/. + + Returns (files_processed, videos_skipped). 
+ """ + processed = 0 + skipped = 0 + for path in iter_metadata_queue_jsons(): + try: + data = json.loads(path.read_text(encoding="utf-8")) + items = data if isinstance(data, list) else [data] + persist_ok = True + last_video_id = "" + for item in items: + try: + _, was_skipped = _persist_video(item) + last_video_id = item.get("video_id", "") + if was_skipped: + skipped += 1 + except Exception: + persist_ok = False + logger.exception( + "_process_queue: persist failed for video_id=%s in %s", + item.get("video_id", "?"), + path, + ) + skipped += 1 + if persist_ok: + _move_to_raw(last_video_id or path.stem, path) + processed += 1 + except Exception: + logger.exception("_process_queue: failed to read %s", path) + return processed, skipped + + +def _get_start_time_from_db() -> Optional[datetime]: + """Return the latest published_at from YouTubeVideo, or None if table is empty.""" + latest = YouTubeVideo.objects.order_by("-published_at").first() + return latest.published_at if latest and latest.published_at else None + + +def _resolve_start_time(start_time_arg: str, dry_run: bool) -> datetime: + """Resolve the start_time for Phase 2 fetch. + + Priority: CLI arg → latest DB record → YOUTUBE_DEFAULT_PUBLISHED_AFTER → 2015-01-01. + """ + if start_time_arg: + dt = parse_datetime(start_time_arg) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + if not dry_run: + db_dt = _get_start_time_from_db() + if db_dt: + logger.info( + "run_cppa_youtube_script_tracker: using start_time from DB: %s", db_dt + ) + return db_dt + + default_after = ( + getattr(settings, "YOUTUBE_DEFAULT_PUBLISHED_AFTER", None) or "" + ).strip() + if default_after: + dt = parse_datetime(default_after) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + fallback = datetime(2015, 1, 1, tzinfo=timezone.utc) + logger.warning( + "run_cppa_youtube_script_tracker: no start_time available; defaulting to %s", + fallback, + ) + return fallback + + +def _resolve_end_time(end_time_arg: str) -> datetime: + """Parse end_time CLI arg or default to now().""" + if end_time_arg: + dt = parse_datetime(end_time_arg) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + return datetime.now(tz=timezone.utc) + + +def _persist_fetched_video(vdata: dict) -> tuple[bool, bool]: + """Write video to metadata queue/, persist to DB, move to raw/metadata/. 
Returns (created, skipped)."""
+    vid = vdata.get("video_id", "")
+    if not vid:
+        return False, True
+
+    queue_path = get_metadata_queue_path(vid)
+    queue_path.parent.mkdir(parents=True, exist_ok=True)
+    queue_path.write_text(json.dumps(vdata, indent=2, default=str), encoding="utf-8")
+
+    try:
+        was_created, was_skipped = _persist_video(vdata)
+        _move_to_raw(vid, queue_path)
+        return was_created, was_skipped
+    except Exception:
+        logger.exception(
+            "run_cppa_youtube_script_tracker: Phase 2 persist failed for video_id=%s",
+            vid,
+        )
+        return False, True
+
+
+def _read_text_file(path: str) -> str:
+    try:
+        with open(path, "r", encoding="utf-8") as file_obj:
+            return file_obj.read()
+    except Exception:
+        return ""
+
+
+def _enrich_speakers_from_transcript(
+    video_obj: YouTubeVideo, transcript_path: str
+) -> None:
+    """Try transcript-based speaker extraction and replace unknown fallback if possible."""
+    transcript_text = _read_text_file(transcript_path)
+    if not transcript_text:
+        return
+
+    resolved = resolve_speakers(
+        title=clean_text(video_obj.title),
+        description=clean_text(video_obj.description),
+        channel_title=(
+            clean_text(video_obj.channel.channel_title) if video_obj.channel else ""
+        ),
+        transcript_text=transcript_text,
+    )
+    if not resolved:
+        return
+
+    # If we discovered a concrete speaker name, remove fallback "unknown" links first.
+    has_known = any(
+        name.casefold() != UNKNOWN_SPEAKER_NAME.casefold() for name in resolved
+    )
+    if has_known:
+        remove_speaker_links_by_name(video_obj, UNKNOWN_SPEAKER_NAME)
+
+    for name in resolved:
+        try:
+            speaker, _ = get_or_create_youtube_speaker(
+                external_id=build_speaker_external_id(
+                    speaker_name=name,
+                    channel_id=(
+                        clean_text(video_obj.channel.channel_id)
+                        if video_obj.channel
+                        else ""
+                    ),
+                    video_id=video_obj.video_id,
+                ),
+                display_name=name,
+            )
+            link_speaker_to_video(video_obj, speaker)
+        except Exception:
+            logger.warning(
+                "_enrich_speakers_from_transcript: could not link speaker %r to video %s",
+                name,
+                video_obj.video_id,
+            )
+
+
+def _run_phase_2(
+    start_time: datetime,
+    end_time: datetime,
+    channel_title: str,
+) -> tuple[int, int]:
+    """Fetch new videos and persist them. Returns (created_count, skipped_count)."""
+    existing_ids: set[str] = set(
+        YouTubeVideo.objects.values_list("video_id", flat=True)
+    )
+    videos = fetch_videos(
+        published_after=start_time,
+        published_before=end_time,
+        channel_title=channel_title or None,
+        skip_video_ids=existing_ids,
+    )
+    created_count = 0
+    skipped_count = 0
+    for vdata in videos:
+        was_created, was_skipped = _persist_fetched_video(vdata)
+        if was_created:
+            created_count += 1
+        elif was_skipped:
+            skipped_count += 1
+    return created_count, skipped_count
+
+
+def _run_phase_3() -> tuple[int, int]:
+    """Download VTT transcripts for videos that don't have one yet.
+
+    Saves directly to raw/transcripts/ (never deleted).
+    Returns (ok_count, fail_count).
+ """ + pending = list(YouTubeVideo.objects.filter(has_transcript=False)) + transcripts_dir = get_raw_transcripts_dir() + ok = 0 + fail = 0 + for video_obj in pending: + vid = video_obj.video_id + try: + vtt_path = download_vtt( + vid, output_dir=transcripts_dir, cookies_file=YOUTUBE_COOKIES_FILE + ) + if vtt_path: + video_obj = YouTubeVideo.objects.get(video_id=vid) + update_video_transcript(video_obj, str(vtt_path)) + _enrich_speakers_from_transcript(video_obj, str(vtt_path)) + ok += 1 + else: + fail += 1 + except Exception: + fail += 1 + logger.exception( + "run_cppa_youtube_script_tracker: transcript download failed for %s", + vid, + ) + return ok, fail + + +def _run_pinecone_sync(app_id: str, namespace: str) -> None: + """Trigger run_cppa_pinecone_sync if app_id and namespace are set.""" + if not app_id: + logger.warning("Pinecone sync skipped: --pinecone-app-id is empty.") + return + if not namespace: + logger.warning( + "Pinecone sync skipped: namespace is empty (set --pinecone-namespace or %s).", + PINECONE_NAMESPACE_ENV_KEY, + ) + return + try: + call_command( + "run_cppa_pinecone_sync", + app_id=app_id, + namespace=namespace, + preprocess_fn=preprocess_youtube_for_pinecone, + ) + logger.info( + "run_cppa_youtube_script_tracker: Pinecone sync complete (app_id=%s, namespace=%s)", + app_id, + namespace, + ) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.warning( + "Pinecone sync skipped/failed (run_cppa_pinecone_sync unavailable or errored): %s", + exc, + ) + + +class Command(BaseCommand): + help = ( + "Fetch YouTube C++ video metadata and transcripts, persist to DB, " + "then optionally upsert to Pinecone. " + "Processes existing metadata queue JSONs first, then fetches from the YouTube Data API." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--start-time", + type=str, + default="", + help=( + "ISO datetime string; fetch videos published after this time. " + "Default: latest published_at in DB (after Phase 1), " + "or YOUTUBE_DEFAULT_PUBLISHED_AFTER env var if DB is empty." + ), + ) + parser.add_argument( + "--end-time", + type=str, + default="", + help="ISO datetime string; fetch videos published before this time. Default: now().", + ) + parser.add_argument( + "--channel-title", + type=str, + default="", + help=( + "Restrict scraping to a specific channel title " + "(must match a key in fetcher.C_PLUS_PLUS_CHANNELS or search by name)." + ), + ) + parser.add_argument( + "--dry-run", action="store_true", help="Skip DB writes and API calls." + ) + parser.add_argument( + "--skip-transcript", action="store_true", help="Skip Phase 3." + ) + parser.add_argument( + "--pinecone-app-id", + type=str, + default="youtube", + help="App ID passed to run_cppa_pinecone_sync.", + ) + parser.add_argument( + "--pinecone-namespace", + type=str, + default=os.getenv(PINECONE_NAMESPACE_ENV_KEY, _DEFAULT_PINECONE_NAMESPACE), + help=f"Pinecone namespace. 
Default from env {PINECONE_NAMESPACE_ENV_KEY}.", + ) + + def handle(self, *args, **options): + start_time_arg = (options.get("start_time") or "").strip() + end_time_arg = (options.get("end_time") or "").strip() + channel_title = (options.get("channel_title") or "").strip() + dry_run: bool = options["dry_run"] + skip_transcript: bool = options["skip_transcript"] + pinecone_app_id = (options.get("pinecone_app_id") or "").strip() + pinecone_namespace = (options.get("pinecone_namespace") or "").strip() + + logger.info( + "run_cppa_youtube_script_tracker: starting " + "(start_time=%s, end_time=%s, channel_title=%s, dry_run=%s, skip_transcript=%s)", + start_time_arg or "auto", + end_time_arg or "now", + channel_title or "all", + dry_run, + skip_transcript, + ) + + try: + self._phase_1(dry_run) + start_time = _resolve_start_time(start_time_arg, dry_run) + end_time = _resolve_end_time(end_time_arg) + + self.stdout.write( + f"Phase 2: fetching videos {start_time.isoformat()} → {end_time.isoformat()} …" + ) + + if dry_run: + self.stdout.write( + self.style.SUCCESS( + f"Dry run: would fetch from {start_time.isoformat()} to " + f"{end_time.isoformat()}. No API calls or DB writes." + ) + ) + return + + self._phase_2(start_time, end_time, channel_title) + self._phase_3(skip_transcript) + _run_pinecone_sync(app_id=pinecone_app_id, namespace=pinecone_namespace) + + except Exception: + logger.exception("run_cppa_youtube_script_tracker: unhandled error") + raise + + def _phase_1(self, dry_run: bool) -> None: + if dry_run: + return + files_processed, videos_skipped = _process_queue() + self.stdout.write( + f"Phase 1: processed {files_processed} queue file(s); {videos_skipped} video(s) skipped." + ) + logger.info( + "run_cppa_youtube_script_tracker: Phase 1 done; queue_files=%d, skipped=%d", + files_processed, + videos_skipped, + ) + + def _phase_2( + self, start_time: datetime, end_time: datetime, channel_title: str + ) -> None: + created_count, skipped_count = _run_phase_2(start_time, end_time, channel_title) + if created_count == 0 and skipped_count == 0: + self.stdout.write(self.style.WARNING("Phase 2: no new videos fetched.")) + logger.info("run_cppa_youtube_script_tracker: Phase 2 — no new videos") + else: + self.stdout.write( + self.style.SUCCESS( + f"Phase 2 done: {created_count} created, {skipped_count} skipped." 
+ ) + ) + logger.info( + "run_cppa_youtube_script_tracker: Phase 2 done; created=%d, skipped=%d", + created_count, + skipped_count, + ) + + def _phase_3(self, skip_transcript: bool) -> None: + if skip_transcript: + self.stdout.write("Phase 3: skipped (--skip-transcript).") + return + ok, fail = _run_phase_3() + self.stdout.write(f"Phase 3 done: {ok} downloaded, {fail} unavailable.") + logger.info( + "run_cppa_youtube_script_tracker: Phase 3 done; ok=%d, fail=%d", ok, fail + ) diff --git a/cppa_youtube_script_tracker/migrations/0001_initial.py b/cppa_youtube_script_tracker/migrations/0001_initial.py new file mode 100644 index 00000000..843b0b30 --- /dev/null +++ b/cppa_youtube_script_tracker/migrations/0001_initial.py @@ -0,0 +1,176 @@ +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("cppa_user_tracker", "0005_youtubespeaker_alter_baseprofile_type"), + ] + + operations = [ + migrations.CreateModel( + name="YouTubeChannel", + fields=[ + ( + "channel_id", + models.CharField(max_length=64, primary_key=True, serialize=False), + ), + ("channel_title", models.CharField(blank=True, max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "verbose_name": "YouTube channel", + "verbose_name_plural": "YouTube channels", + "ordering": ["channel_title"], + }, + ), + migrations.CreateModel( + name="CppaTags", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("tag_name", models.CharField(db_index=True, max_length=128, unique=True)), + ], + options={ + "verbose_name": "CPPA tag", + "verbose_name_plural": "CPPA tags", + "ordering": ["tag_name"], + }, + ), + migrations.CreateModel( + name="YouTubeVideo", + fields=[ + ( + "video_id", + models.CharField(max_length=32, primary_key=True, serialize=False), + ), + ( + "channel", + models.ForeignKey( + blank=True, + db_column="channel_id", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="videos", + to="cppa_youtube_script_tracker.youtubechannel", + ), + ), + ("title", models.CharField(blank=True, max_length=512)), + ("description", models.TextField(blank=True)), + ("published_at", models.DateTimeField(blank=True, db_index=True, null=True)), + ("duration_seconds", models.IntegerField(default=0)), + ("view_count", models.IntegerField(blank=True, null=True)), + ("like_count", models.IntegerField(blank=True, null=True)), + ("comment_count", models.IntegerField(blank=True, null=True)), + ("search_term", models.CharField(blank=True, max_length=255)), + ("has_transcript", models.BooleanField(default=False)), + ("transcript_path", models.CharField(blank=True, max_length=1024)), + ("scraped_at", models.DateTimeField(blank=True, null=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "verbose_name": "YouTube video", + "verbose_name_plural": "YouTube videos", + "ordering": ["-published_at"], + }, + ), + migrations.CreateModel( + name="YouTubeVideoSpeaker", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "video", + models.ForeignKey( + db_column="video_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_speakers", + 
to="cppa_youtube_script_tracker.youtubevideo", + ), + ), + ( + "speaker", + models.ForeignKey( + db_column="speaker_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_appearances", + to="cppa_user_tracker.youtubespeaker", + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ], + options={ + "verbose_name": "YouTube video speaker", + "verbose_name_plural": "YouTube video speakers", + "ordering": ["video", "speaker"], + }, + ), + migrations.AddConstraint( + model_name="youtubevideospeaker", + constraint=models.UniqueConstraint( + fields=["video", "speaker"], name="unique_video_speaker" + ), + ), + migrations.CreateModel( + name="YouTubeVideoTags", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "youtube_video", + models.ForeignKey( + db_column="youtube_video_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_tags", + to="cppa_youtube_script_tracker.youtubevideo", + ), + ), + ( + "cppa_tag", + models.ForeignKey( + db_column="cppa_tag_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="tagged_videos", + to="cppa_youtube_script_tracker.cppatags", + ), + ), + ], + options={ + "verbose_name": "YouTube video tag", + "verbose_name_plural": "YouTube video tags", + "ordering": ["youtube_video", "cppa_tag"], + }, + ), + migrations.AddConstraint( + model_name="youtubevideotags", + constraint=models.UniqueConstraint( + fields=["youtube_video", "cppa_tag"], name="unique_video_tag" + ), + ), + ] diff --git a/cppa_youtube_script_tracker/migrations/__init__.py b/cppa_youtube_script_tracker/migrations/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/cppa_youtube_script_tracker/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/models.py b/cppa_youtube_script_tracker/models.py new file mode 100644 index 00000000..d852ad9f --- /dev/null +++ b/cppa_youtube_script_tracker/models.py @@ -0,0 +1,145 @@ +""" +Models per docs/Schema.md — cppa_youtube_script_tracker section. + +Tables: +- YouTubeChannel: publisher channel (e.g. CppCon, C++Now); channel_id is PK +- YouTubeVideo: individual video metadata + transcript state; video_id is PK +- YouTubeVideoSpeaker: M2M join between YouTubeVideo and cppa_user_tracker.YoutubeSpeaker +- CppaTags: C++ community tag vocabulary +- YouTubeVideoTags: M2M join between YouTubeVideo and CppaTags +""" + +from django.db import models + + +class YouTubeChannel(models.Model): + """Publishing channel a video was uploaded to (e.g. CppCon, C++Now). + + channel_id is the YouTube channel ID and serves as the primary key. + """ + + channel_id = models.CharField(max_length=64, primary_key=True) + channel_title = models.CharField(max_length=255, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["channel_title"] + verbose_name = "YouTube channel" + verbose_name_plural = "YouTube channels" + + def __str__(self) -> str: + return self.channel_title or self.channel_id + + +class YouTubeVideo(models.Model): + """YouTube video metadata and transcript download state. + + video_id is the YouTube video ID and serves as the primary key. 
+ """ + + video_id = models.CharField(max_length=32, primary_key=True) + channel = models.ForeignKey( + YouTubeChannel, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="videos", + db_column="channel_id", + ) + title = models.CharField(max_length=512, blank=True) + description = models.TextField(blank=True) + published_at = models.DateTimeField(null=True, blank=True, db_index=True) + duration_seconds = models.IntegerField(default=0) + view_count = models.IntegerField(null=True, blank=True) + like_count = models.IntegerField(null=True, blank=True) + comment_count = models.IntegerField(null=True, blank=True) + search_term = models.CharField(max_length=255, blank=True) + has_transcript = models.BooleanField(default=False) + transcript_path = models.CharField(max_length=1024, blank=True) + scraped_at = models.DateTimeField(null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-published_at"] + verbose_name = "YouTube video" + verbose_name_plural = "YouTube videos" + + def __str__(self) -> str: + return self.title or self.video_id + + +class YouTubeVideoSpeaker(models.Model): + """M2M join: links a YouTubeVideo to a YoutubeSpeaker profile.""" + + video = models.ForeignKey( + YouTubeVideo, + on_delete=models.CASCADE, + related_name="video_speakers", + db_column="video_id", + ) + speaker = models.ForeignKey( + "cppa_user_tracker.YoutubeSpeaker", + on_delete=models.CASCADE, + related_name="video_appearances", + db_column="speaker_id", + ) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["video", "speaker"], name="unique_video_speaker" + ) + ] + ordering = ["video", "speaker"] + verbose_name = "YouTube video speaker" + verbose_name_plural = "YouTube video speakers" + + def __str__(self) -> str: + return f"video={self.video_id} speaker={self.speaker_id}" # type: ignore[attr-defined] + + +class CppaTags(models.Model): + """C++ community tag vocabulary (e.g. 'concurrency', 'templates', 'modules').""" + + tag_name = models.CharField(max_length=128, unique=True, db_index=True) + + class Meta: + ordering = ["tag_name"] + verbose_name = "CPPA tag" + verbose_name_plural = "CPPA tags" + + def __str__(self) -> str: + return self.tag_name + + +class YouTubeVideoTags(models.Model): + """M2M join: links a YouTubeVideo to a CppaTags entry.""" + + youtube_video = models.ForeignKey( + YouTubeVideo, + on_delete=models.CASCADE, + related_name="video_tags", + db_column="youtube_video_id", + ) + cppa_tag = models.ForeignKey( + CppaTags, + on_delete=models.CASCADE, + related_name="tagged_videos", + db_column="cppa_tag_id", + ) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["youtube_video", "cppa_tag"], name="unique_video_tag" + ) + ] + ordering = ["youtube_video", "cppa_tag"] + verbose_name = "YouTube video tag" + verbose_name_plural = "YouTube video tags" + + def __str__(self) -> str: + return f"video={self.youtube_video_id} tag={self.cppa_tag_id}" # type: ignore[attr-defined] diff --git a/cppa_youtube_script_tracker/preprocessor.py b/cppa_youtube_script_tracker/preprocessor.py new file mode 100644 index 00000000..36e8454c --- /dev/null +++ b/cppa_youtube_script_tracker/preprocessor.py @@ -0,0 +1,175 @@ +""" +Pinecone preprocess function for cppa_youtube_script_tracker. 
+ +Guideline source: docs/Pinecone_preprocess_guideline_c.md + +Returns whole-document payloads (is_chunked=False) so the sync pipeline can +apply its configured chunking strategy. +""" + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any + +from django.db.models import Q + +from .models import YouTubeVideo + + +def _normalize_failed_ids(failed_ids: list[str]) -> list[str]: + """Return stripped, non-empty, de-duplicated failed IDs preserving order.""" + seen: set[str] = set() + out: list[str] = [] + for raw in failed_ids: + value = (raw or "").strip() + if not value or value in seen: + continue + seen.add(value) + out.append(value) + return out + + +def _read_vtt(transcript_path: str) -> str: + """Return plain text from a .vtt file, stripping VTT header/timestamps. + + Returns empty string if the file does not exist or cannot be read. + """ + path = Path(transcript_path) + if not path.exists(): + return "" + try: + raw = path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "" + + lines: list[str] = [] + for line in raw.splitlines(): + line = line.strip() + # Skip WEBVTT header, NOTE blocks, blank lines, and timestamp lines + if not line: + continue + if line.startswith("WEBVTT") or line.startswith("NOTE"): + continue + # Timestamp lines: "00:00:00.000 --> 00:00:05.000" or similar + if "-->" in line: + continue + # Cue-setting lines (e.g. "align:start position:0%") + if line.startswith("align:") or line.startswith("position:"): + continue + lines.append(line) + + return " ".join(lines).strip() + + +def _get_speaker_names(video: YouTubeVideo) -> list[str]: + """Return a sorted list of speaker display_names linked to this video.""" + names = list( + video.video_speakers.select_related("speaker") + .values_list("speaker__display_name", flat=True) + .order_by("speaker__display_name") + ) + return [n for n in names if n] + + +def _build_document_content(video: YouTubeVideo, speaker_names: list[str]) -> str: + """Build plain-text content for embedding.""" + parts: list[str] = [] + + if video.title: + parts.append(f"Title: {video.title.strip()}") + if speaker_names: + parts.append(f"Speakers: {', '.join(speaker_names)}") + if video.channel and video.channel.channel_title: + parts.append(f"Channel: {video.channel.channel_title.strip()}") + if video.published_at: + parts.append(f"Published: {video.published_at.isoformat()}") + + description = (video.description or "").strip() + if description: + parts.append(f"\nDescription:\n{description}") + + if video.has_transcript and video.transcript_path: + transcript_text = _read_vtt(video.transcript_path) + if transcript_text: + parts.append(f"\nTranscript:\n{transcript_text}") + + return "\n".join(parts).strip() + + +def _build_candidate_queryset( + normalized_failed: list[str], final_sync_at: datetime | None +): + """Return the ORM queryset of candidates to preprocess.""" + queryset = YouTubeVideo.objects.select_related("channel").prefetch_related( + "video_speakers__speaker" + ) + if final_sync_at is None and not normalized_failed: + return queryset.order_by("video_id") + criteria = Q() + if final_sync_at is not None: + criteria |= Q(updated_at__gt=final_sync_at) + if normalized_failed: + criteria |= Q(video_id__in=normalized_failed) + return queryset.filter(criteria).order_by("video_id") + + +def _build_video_metadata( + video: YouTubeVideo, speaker_names: list[str] +) -> dict[str, Any]: + """Build the Pinecone metadata dict for one video.""" + channel_title = 
(video.channel.channel_title if video.channel else "") or "" + return { + "doc_id": f"youtube-{video.video_id}", + "ids": str(video.pk), + "type": "youtube", + "url": f"https://www.youtube.com/watch?v={video.video_id}", + "title": video.title or "", + "author": ", ".join(speaker_names), + "channel": channel_title, + "timestamp": int(video.published_at.timestamp()) if video.published_at else 0, + "has_transcript": video.has_transcript, + } + + +def preprocess_youtube_for_pinecone( + failed_ids: list[str], + final_sync_at: datetime | None, +) -> tuple[list[dict[str, Any]], bool]: + """Build Pinecone sync documents for YouTube videos. + + Args: + failed_ids: Previous-run failed source IDs (video_id values). + final_sync_at: Last sync timestamp for incremental sync; None means first sync. + + Returns: + (documents, is_chunked) + - documents: list[{"content": str, "metadata": dict}] + - is_chunked: False (whole docs; pipeline may chunk later) + """ + normalized_failed = _normalize_failed_ids(failed_ids or []) + candidates = _build_candidate_queryset(normalized_failed, final_sync_at) + + docs: list[dict[str, Any]] = [] + seen_video_ids: set[str] = set() + + for video in candidates: + vid = (video.video_id or "").strip() + if not vid or vid in seen_video_ids: + continue + seen_video_ids.add(vid) + + speaker_names = _get_speaker_names(video) + content = _build_document_content(video, speaker_names) + if not content: + continue + + docs.append( + { + "content": content, + "metadata": _build_video_metadata(video, speaker_names), + } + ) + + return docs, False diff --git a/cppa_youtube_script_tracker/services.py b/cppa_youtube_script_tracker/services.py new file mode 100644 index 00000000..7ec7877e --- /dev/null +++ b/cppa_youtube_script_tracker/services.py @@ -0,0 +1,158 @@ +""" +Service layer for cppa_youtube_script_tracker. + +All creates/updates/deletes for this app's models must go through functions in this +module. Do not call Model.objects.create(), model.save(), or model.delete() from +outside this module. + +See docs/Contributing.md for the project-wide rule. +""" + +from __future__ import annotations + +from typing import Any, Optional + +from .models import ( + CppaTags, + YouTubeChannel, + YouTubeVideo, + YouTubeVideoSpeaker, + YouTubeVideoTags, +) + + +def _parse_dt_field(value: Any) -> Any: + """Parse a datetime string field; returns datetime, None, or the original value.""" + if isinstance(value, str) and value: + from django.utils.dateparse import parse_datetime as _pd + + return _pd(value) + return value + + +def get_or_create_channel( + channel_id: str, + channel_title: str = "", +) -> YouTubeChannel: + """Get or create a YouTubeChannel by channel_id (PK). + + If the channel exists and channel_title differs, the title is updated. + Returns the YouTubeChannel instance. 
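+
+    Example (illustrative call; the channel ID value is made up):
+        channel = get_or_create_channel("UCxxxxxxxxxxxxxxxxxxxxxx", "CppCon")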
+ """ + channel_id_val = (channel_id or "").strip() + if not channel_id_val: + raise ValueError("channel_id must not be empty.") + channel_title_val = (channel_title or "").strip() + channel, created = YouTubeChannel.objects.get_or_create( + channel_id=channel_id_val, + defaults={"channel_title": channel_title_val}, + ) + if not created and channel_title_val and channel.channel_title != channel_title_val: + channel.channel_title = channel_title_val + channel.save(update_fields=["channel_title", "updated_at"]) + return channel + + +def get_or_create_video( + video_id: str, + channel: Optional[YouTubeChannel], + metadata_dict: dict[str, Any], +) -> tuple[YouTubeVideo, bool]: + """Get or create a YouTubeVideo by video_id (PK). Returns (video, created). + + metadata_dict keys (all optional): + title, description, published_at (datetime or ISO str), duration_seconds, + view_count, like_count, comment_count, search_term, scraped_at. + + Raises ValueError if video_id is empty. + """ + video_id_val = (video_id or "").strip() + if not video_id_val: + raise ValueError("video_id must not be empty.") + + published_at = _parse_dt_field(metadata_dict.get("published_at")) + scraped_at = _parse_dt_field(metadata_dict.get("scraped_at")) + + defaults: dict[str, Any] = { + "channel": channel, + "title": (metadata_dict.get("title") or ""), + "description": (metadata_dict.get("description") or ""), + "published_at": published_at, + "duration_seconds": int(metadata_dict.get("duration_seconds") or 0), + "view_count": metadata_dict.get("view_count"), + "like_count": metadata_dict.get("like_count"), + "comment_count": metadata_dict.get("comment_count"), + "search_term": (metadata_dict.get("search_term") or ""), + "scraped_at": scraped_at, + } + video, created = YouTubeVideo.objects.get_or_create( + video_id=video_id_val, + defaults=defaults, + ) + return video, created + + +def update_video_transcript( + video: YouTubeVideo, + transcript_path: str, +) -> YouTubeVideo: + """Mark video as having a transcript and store its path. Returns the updated video.""" + video.has_transcript = True + video.transcript_path = (transcript_path or "").strip() + video.save(update_fields=["has_transcript", "transcript_path", "updated_at"]) + return video + + +def link_speaker_to_video( + video: YouTubeVideo, + speaker: Any, +) -> YouTubeVideoSpeaker: + """Link a YoutubeSpeaker to a YouTubeVideo (get-or-create). Returns YouTubeVideoSpeaker.""" + join, _ = YouTubeVideoSpeaker.objects.get_or_create( + video=video, + speaker=speaker, + ) + return join + + +def remove_speaker_links_by_name( + video: YouTubeVideo, + speaker_name: str, +) -> int: + """Remove all speaker links for a video where speaker.display_name matches speaker_name. + + Returns number of deleted join rows. + """ + speaker_name_val = (speaker_name or "").strip() + if not speaker_name_val: + return 0 + deleted, _ = YouTubeVideoSpeaker.objects.filter( + video=video, + speaker__display_name=speaker_name_val, + ).delete() + return int(deleted) + + +def get_or_create_tag(tag_name: str) -> CppaTags: + """Get or create a CppaTags entry by tag_name. + + Raises ValueError if tag_name is empty. + Returns the CppaTags instance. + """ + tag_name_val = (tag_name or "").strip().lower() + if not tag_name_val: + raise ValueError("tag_name must not be empty.") + tag, _ = CppaTags.objects.get_or_create(tag_name=tag_name_val) + return tag + + +def link_tag_to_video( + video: YouTubeVideo, + tag: CppaTags, +) -> YouTubeVideoTags: + """Link a CppaTags entry to a YouTubeVideo (get-or-create). 
Returns YouTubeVideoTags.""" + join, _ = YouTubeVideoTags.objects.get_or_create( + youtube_video=video, + cppa_tag=tag, + ) + return join diff --git a/cppa_youtube_script_tracker/tests/__init__.py b/cppa_youtube_script_tracker/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cppa_youtube_script_tracker/transcript.py b/cppa_youtube_script_tracker/transcript.py new file mode 100644 index 00000000..585fdf47 --- /dev/null +++ b/cppa_youtube_script_tracker/transcript.py @@ -0,0 +1,87 @@ +""" +VTT transcript downloader for cppa_youtube_script_tracker. + +Adapted from cppa-brain-backend/copilot_data/scrape/youtube_cpp/scraper.py +(YouTubeCppScraper._content_download / _setup_ytdlp). +Uses yt-dlp to download auto-generated or manual English subtitles as .vtt files. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +_YDL_OPTS_BASE: dict = { + "skip_download": True, + "force_ipv4": True, + "writesubtitles": True, + "writeautomaticsub": True, + "subtitleslangs": ["en"], + "subtitlesformat": "vtt", + "quiet": False, + "no_warnings": False, + "ignore_no_formats_error": True, + "extractor_args": { + "youtube": ["player_client=tv,web_safari"], + }, +} + + +def download_vtt( + video_id: str, + output_dir: Path, + cookies_file: Optional[str] = None, +) -> Optional[Path]: + """Download the English VTT transcript for video_id into output_dir. + + Tries manual captions first, then auto-generated. Returns the Path to the + downloaded .vtt file on success, or None if no transcript was found. + + Args: + video_id: YouTube video ID (11 characters). + output_dir: Directory where the .vtt file will be written. + cookies_file: Optional path to a cookies.txt for authenticated requests. + + Returns: + Path to the downloaded file (e.g. output_dir/{video_id}.en.vtt), or None. + """ + try: + import yt_dlp + except ImportError as exc: + raise ImportError("yt-dlp is required: pip install yt-dlp") from exc + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + url = f"https://www.youtube.com/watch?v={video_id}" + outtmpl = str(output_dir / "%(id)s.%(ext)s") + + ydl_opts = dict(_YDL_OPTS_BASE) + ydl_opts["outtmpl"] = outtmpl + if cookies_file: + ydl_opts["cookiefile"] = cookies_file + + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.error("download_vtt: yt-dlp error for %s: %s", video_id, exc) + return None + + # yt-dlp writes {video_id}.{lang}.vtt; try most common pattern first + expected = output_dir / f"{video_id}.en.vtt" + if expected.exists(): + logger.debug("download_vtt: found %s", expected) + return expected + + # Fallback: look for any .vtt file matching the video_id + matches = list(output_dir.glob(f"{video_id}*.vtt")) + if matches: + logger.debug("download_vtt: found %s (fallback glob)", matches[0]) + return matches[0] + + logger.info("download_vtt: no VTT transcript found for %s", video_id) + return None diff --git a/cppa_youtube_script_tracker/utils.py b/cppa_youtube_script_tracker/utils.py new file mode 100644 index 00000000..0d6c28e4 --- /dev/null +++ b/cppa_youtube_script_tracker/utils.py @@ -0,0 +1,201 @@ +""" +Speaker extraction utilities for cppa_youtube_script_tracker. 
+ +Priority order: +1) description patterns +2) title pattern +3) transcript introduction patterns +4) fallback to "unknown" +""" + +from __future__ import annotations + +import re +from typing import Iterable + +UNKNOWN_SPEAKER_NAME = "unknown" + +_SEPARATORS = (" - ", " — ", " | ") +_INTRO_RE = re.compile( + r"(?i)\b(?:i am|my name is)\s+([A-Z][A-Za-z'`-]*(?:\s+[A-Z][A-Za-z'`-]*){0,4})" +) + + +def clean_text(value: object) -> str: + if value is None: + return "" + return str(value).replace("\x00", "").replace("\u2019", "'").strip() + + +def _slugify_speaker_name(name: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "_", clean_text(name).lower()).strip("_") + return slug or "unknown" + + +def build_speaker_external_id( + speaker_name: str, + channel_id: str = "", + video_id: str = "", +) -> str: + """Build a stable speaker external identifier from channel/video context.""" + slug = _slugify_speaker_name(speaker_name) + channel_id = clean_text(channel_id) + video_id = clean_text(video_id) + if channel_id: + return f"youtube:channel:{channel_id}:speaker:{slug}" + if video_id: + return f"youtube:video:{video_id}:speaker:{slug}" + return f"youtube:name:{slug}" + + +def _normalize_name(name: str) -> str: + name = re.sub(r"\s+", " ", clean_text(name)) + name = name.strip(" .,:;\"'`-") + return name + + +def _dedupe_keep_order(values: Iterable[str]) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for value in values: + key = value.casefold() + if value and key not in seen: + seen.add(key) + out.append(value) + return out + + +def _extract_speaker_colon_line(description: str) -> list[str]: + # Example: "Speaker: Ehsan Amiri" + matches = re.findall(r"(?im)^\s*speaker\s*:\s*(.+?)\s*$", description or "") + return [_normalize_name(m) for m in matches if _normalize_name(m)] + + +def _extract_middle_name_from_triplet( + text: str, title: str = "", channel_title: str = "" +) -> str: + """ + Try parsing structures like: + {title} - {speaker} - {channel} + """ + text_norm = clean_text(text) + if not text_norm: + return "" + + for sep in _SEPARATORS: + if sep not in text_norm: + continue + parts = [_normalize_name(p) for p in text_norm.split(sep)] + parts = [p for p in parts if p] + if len(parts) < 3: + continue + candidate = parts[-2] + last = parts[-1].casefold() + first = parts[0].casefold() + title_cf = clean_text(title).casefold() + channel_cf = clean_text(channel_title).casefold() + + # Prefer high-confidence: title/speaker/channel match pattern. + if channel_cf and channel_cf in last: + return candidate + if title_cf and title_cf in first: + return candidate + + return "" + + +def _extract_from_intro_pattern(text: str) -> list[str]: + matches = _INTRO_RE.findall(text or "") + return [_normalize_name(m) for m in matches if _normalize_name(m)] + + +def extract_speakers_from_description( + description: str, title: str = "", channel_title: str = "" +) -> list[str]: + """ + Description-based speaker extraction: + - line starting with "Speaker:" + - 4th non-empty line pattern: {title} - {speaker} - {channel} + - intro pattern: "I am ..." / "my name is ..." 
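+
+    Example (illustrative names): with title "Keynote" and channel_title "CppCon",
+    a description line "Keynote - Jane Doe - CppCon" yields ["Jane Doe"].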
+ """ + description = clean_text(description) + if not description: + return [] + + speakers: list[str] = [] + speakers.extend(_extract_speaker_colon_line(description)) + + non_empty_lines = [ln.strip() for ln in description.splitlines() if ln.strip()] + if len(non_empty_lines) >= 4: + candidate = _extract_middle_name_from_triplet( + non_empty_lines[3], title=title, channel_title=channel_title + ) + if candidate: + speakers.append(candidate) + + for line in non_empty_lines: + candidate = _extract_middle_name_from_triplet( + line, title=title, channel_title=channel_title + ) + if candidate: + speakers.append(candidate) + + speakers.extend(_extract_from_intro_pattern(description)) + return _dedupe_keep_order(speakers) + + +def extract_speakers_from_title(title: str, channel_title: str = "") -> list[str]: + """ + Title-based extraction for structures: + {title} - {speaker} - {channel} + """ + title = clean_text(title) + if not title: + return [] + + candidate = _extract_middle_name_from_triplet( + title, title=title, channel_title=channel_title + ) + if candidate: + return [candidate] + return [] + + +def extract_speakers_from_transcript_text(transcript_text: str) -> list[str]: + """ + Transcript fallback extraction using introduction patterns. + We prioritize early transcript content where introductions usually appear. + """ + transcript_text = clean_text(transcript_text) + if not transcript_text: + return [] + early_text = transcript_text[:8000] + return _dedupe_keep_order(_extract_from_intro_pattern(early_text)) + + +def resolve_speakers( + *, + title: str, + description: str, + channel_title: str = "", + transcript_text: str = "", +) -> list[str]: + """ + Resolve speakers using priority: + description -> title -> transcript -> ["unknown"] + """ + from_description = extract_speakers_from_description( + description=description, title=title, channel_title=channel_title + ) + if from_description: + return from_description + + from_title = extract_speakers_from_title(title=title, channel_title=channel_title) + if from_title: + return from_title + + from_transcript = extract_speakers_from_transcript_text(transcript_text) + if from_transcript: + return from_transcript + + return [UNKNOWN_SPEAKER_NAME] diff --git a/cppa_youtube_script_tracker/workspace.py b/cppa_youtube_script_tracker/workspace.py new file mode 100644 index 00000000..19345ba8 --- /dev/null +++ b/cppa_youtube_script_tracker/workspace.py @@ -0,0 +1,85 @@ +""" +Workspace paths for cppa_youtube_script_tracker. 
+ +Layout: +- Metadata queue: workspace/cppa_youtube_script_tracker/metadata/{video_id}.json + (short-lived; moved to raw after DB persist) +- Raw metadata: workspace/raw/cppa_youtube_script_tracker/metadata/{video_id}.json + (permanent archive; never deleted) +- Raw transcripts: workspace/raw/cppa_youtube_script_tracker/transcripts/{video_id}.en.vtt + (permanent archive; never deleted) +""" + +from pathlib import Path + +from config.workspace import get_workspace_path + +_APP_SLUG = "cppa_youtube_script_tracker" +_RAW_APP_SLUG = f"raw/{_APP_SLUG}" + + +def get_workspace_root() -> Path: + """Return this app's workspace directory (workspace/cppa_youtube_script_tracker/).""" + return get_workspace_path(_APP_SLUG) + + +def get_raw_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/; creates if missing.""" + path = get_workspace_path(_RAW_APP_SLUG) + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_metadata_queue_dir() -> Path: + """Return workspace/cppa_youtube_script_tracker/metadata/; creates if missing. + + JSON files here are short-lived: moved to raw/metadata/ after DB persist. + """ + path = get_workspace_root() / "metadata" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_raw_metadata_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/metadata/; creates if missing. + + Permanent archive: JSON files are never deleted after being moved here. + """ + path = get_raw_dir() / "metadata" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_raw_transcripts_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/transcripts/; creates if missing. + + Permanent archive: VTT files are never deleted. + """ + path = get_raw_dir() / "transcripts" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_metadata_queue_path(video_id: str) -> Path: + """Return workspace/cppa_youtube_script_tracker/metadata/{video_id}.json.""" + return get_metadata_queue_dir() / f"{video_id}.json" + + +def get_raw_metadata_path(video_id: str) -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/metadata/{video_id}.json.""" + return get_raw_metadata_dir() / f"{video_id}.json" + + +def get_transcript_path(video_id: str, lang: str = "en") -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/transcripts/{video_id}.{lang}.vtt.""" + return get_raw_transcripts_dir() / f"{video_id}.{lang}.vtt" + + +def iter_metadata_queue_jsons(): + """Yield Path for each *.json file in the metadata queue directory.""" + queue_dir = get_workspace_root() / "metadata" + if not queue_dir.is_dir(): + return + for path in sorted(queue_dir.glob("*.json")): + if not path.name.startswith("."): + yield path diff --git a/docker-compose.yml b/docker-compose.yml index ddc897a1..eff59c64 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -61,8 +61,8 @@ services: ALLOWED_HOSTS: ${ALLOWED_HOSTS:-localhost,127.0.0.1,web,0.0.0.0} SELENIUM_HUB_URL: http://selenium:4444/wd/hub volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs - ./staticfiles:/app/staticfiles depends_on: # db: { condition: service_healthy } @@ -80,8 +80,8 @@ services: CELERY_RESULT_BACKEND: redis://redis:6379/0 SELENIUM_HUB_URL: http://selenium:4444/wd/hub volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs command: celery -A config worker -l info depends_on: # db: { condition: service_healthy } @@ -98,8 +98,8 @@ services: 
CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs - celerybeat_data:/app/celerybeat command: celery -A config beat -l info depends_on: @@ -109,6 +109,6 @@ services: volumes: postgres_data: - workspace_data: - logs_data: + # workspace_data: + # logs_data: celerybeat_data: diff --git a/docs/Pinecone_preprocess_guideline.md b/docs/Pinecone_preprocess_guideline.md index ac049024..ad43ec43 100644 --- a/docs/Pinecone_preprocess_guideline.md +++ b/docs/Pinecone_preprocess_guideline.md @@ -71,7 +71,7 @@ Each item in the list must be a dict with at least: | `content` | top-level | Yes | The text to index (plain string). | | `metadata` | top-level | Yes | Dict of metadata attached to the document. | | `metadata["doc_id"]` or `metadata["url"]` | inside `metadata` | One required | Stable identifier for the document (e.g. primary key, URL). Used for chunk IDs and for skipping invalid docs. | -| `metadata["ids"]` | inside `metadata` | Recommended | Comma-separated **source record IDs** (e.g. DB primary keys). Used to record failed IDs when an upsert fails so they can be retried next run. If omitted, failed-document tracking for that item will be empty. | +| `metadata["source_ids"]` | inside `metadata` | Recommended | Comma-separated **source record IDs** (e.g. DB primary keys). Used to record failed IDs when an upsert fails so they can be retried next run. If omitted, failed-document tracking for that item will be empty. | Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed through to Pinecone and can be used for filtering or display. @@ -82,7 +82,7 @@ Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed throu "content": "The actual text to index for this document or chunk.", "metadata": { "doc_id": "slack-msg-12345", # or "url": "https://..." - "ids": "12345", # source ID(s) for retry tracking + "source_ids": "12345", # source ID(s) for retry tracking "title": "Optional title", }, } @@ -90,12 +90,12 @@ Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed throu ### Example with multiple source IDs (e.g. one chunk from multiple rows) -If one logical “document” is built from several source records, pass their IDs in `metadata["ids"]` as a comma-separated string so that if the upsert fails, all of them are recorded for retry: +If one logical “document” is built from several source records, pass their IDs in `metadata["source_ids"]` as a comma-separated string so that if the upsert fails, all of them are recorded for retry: ```python "metadata": { "doc_id": "thread-abc", - "ids": "101,102,103", + "source_ids": "101,102,103", } ``` @@ -145,21 +145,33 @@ python manage.py run_cppa_pinecone_sync \ --pinecone-instance private ``` -| Instance | Django setting read | `.env` key | -|------------|-----------------------------|-----------------------------| -| `public` | `PINECONE_API_KEY` | `PINECONE_API_KEY` | -| `private` | `PINECONE_PRIVATE_API_KEY` | `PINECONE_PRIVATE_API_KEY` | +| Instance | Django setting read | `.env` key | +| --------- | -------------------------- | -------------------------- | +| `public` | `PINECONE_API_KEY` | `PINECONE_API_KEY` | +| `private` | `PINECONE_PRIVATE_API_KEY` | `PINECONE_PRIVATE_API_KEY` | If no `instance` is specified, **public** is used. 
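+
+Putting the pieces above together, a minimal conforming preprocess function might look
+like the sketch below. The model and field names (`MyMessage`, `body`, `updated_at`) are
+illustrative, not part of any app in this repository.
+
+```python
+from datetime import datetime
+from typing import Any
+
+from myapp.models import MyMessage  # illustrative model
+
+
+def preprocess_my_messages(
+    failed_ids: list[str], final_sync_at: datetime | None
+) -> tuple[list[dict[str, Any]], bool]:
+    qs = MyMessage.objects.all()
+    if final_sync_at is not None:
+        qs = qs.filter(updated_at__gt=final_sync_at)  # incremental sync
+    if failed_ids:
+        qs = qs | MyMessage.objects.filter(pk__in=failed_ids)  # retry previous failures
+    docs = [
+        {
+            "content": m.body,
+            "metadata": {"doc_id": f"my-msg-{m.pk}", "source_ids": str(m.pk)},
+        }
+        for m in qs.distinct()
+    ]
+    return docs, False  # whole documents; the pipeline applies its own chunking
+```
+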
--- +## Clang GitHub Tracker (`clang_github_tracker`) + +For **llvm/llvm-project** issues and PRs, `clang_github_tracker.preprocessors.issue_preprocessor` and `pr_preprocessor` **do not** scan all raw JSON files. They: + +1. Select candidate **numbers** from the DB: `ClangGithubIssueItem` rows where `updated_at > final_sync_at` (or **all** rows if `final_sync_at` is `None`), filtered by `is_pull_request`. +2. Union **retry** numbers parsed from `failed_ids` strings (e.g. `…:issue:123`, `…:pr:456`). +3. For each number, read the corresponding raw file under `workspace/raw/github_activity_tracker/...` and build the document with `github_activity_tracker.preprocessors.github_preprocess.build_issue_document` / `build_pr_document`. + +The **`cppa_pinecone_sync`** contract (`preprocess_fn(failed_ids, final_sync_at)`, fail list, sync status) is unchanged; only the clang preprocessors’ **selection** strategy differs from the Boost path. + +--- + ## Summary checklist - [ ] Signature: `(failed_ids: list[str], final_sync_at: datetime | None) -> tuple[list[dict], bool]`. - [ ] Each dict has top-level `content` (str) and `metadata` (dict). - [ ] Each `metadata` has at least one of `doc_id` or `url`. -- [ ] For retry tracking, set `metadata["ids"]` to the source record ID(s), comma-separated if multiple. +- [ ] For retry tracking, set `metadata["source_ids"]` to the source record ID(s), comma-separated if multiple. - [ ] Use `failed_ids` to re-include previously failed records. - [ ] Use `final_sync_at` for incremental sync when applicable. - [ ] Return `is_chunked=True` only if you are already emitting final chunks; otherwise `False`. diff --git a/docs/Schema.md b/docs/Schema.md index 56f258d3..5b012f62 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -20,6 +20,7 @@ erDiagram BaseProfile ||--o| SlackUser : "extends" BaseProfile ||--o| MailingListProfile : "extends" BaseProfile ||--o| WG21PaperAuthorProfile : "extends" + BaseProfile ||--o| YoutubeSpeaker : "extends" Identity }o--|| BaseProfile : "has" TempProfileIdentityRelation ||--o{ BaseProfile : "has" TmpIdentity ||--o{ TempProfileIdentityRelation : "has" @@ -67,6 +68,13 @@ erDiagram } WG21PaperAuthorProfile { + string display_name "IX" + string author_alias "IX" + datetime created_at + datetime updated_at + } + + YoutubeSpeaker { string display_name "IX" datetime created_at datetime updated_at @@ -101,7 +109,7 @@ erDiagram **Note:** The **Email** table references BaseProfile via `base_profile_id` (FK to `BaseProfile.id`). One profile can have multiple email addresses; `is_primary` marks the primary email; `is_active` indicates whether the email is currently active. Other tables (e.g. MailingListMessage) can link to a profile via Email. **Note:** The `email` field is **not unique**; the same email address may appear in multiple rows (e.g. for different profiles or over time). -**Note:** The `type` field is a PostgreSQL enum (or equivalent) with values: `github`, `slack`, `mailing_list`, `wg21`. It identifies which extended table the row belongs to. +**Note:** The `type` field is a PostgreSQL enum (or equivalent) with values: `github`, `slack`, `mailing_list`, `wg21`, `discord`, `youtube`. It identifies which extended table the row belongs to. **Note:** In **GitHubAccount**, the `type` field is an enum with values: `user`, `organization`, `enterprise` (identifies whether the GitHub account is a user, organization, or enterprise). @@ -348,6 +356,19 @@ erDiagram --- +### 2b. 
Clang GitHub Tracker (`clang_github_tracker`) + +Standalone tables for the **llvm/llvm-project** (or `CLANG_GITHUB_OWNER` / `CLANG_GITHUB_REPO`) mirror. **No foreign keys** to other apps. + +| Model | Purpose | +| ----- | ------- | +| **ClangGithubIssueItem** | One row per issue or PR **number** (`unique`). `is_pull_request` distinguishes types. `github_created_at` / `github_updated_at` mirror GitHub API times; **`github_updated_at`** (with `Max` + 1ms) drives **API fetch** resume. Django **`updated_at`** (`auto_now`) bumps on every upsert and drives **Pinecone** incrementality vs `PineconeSyncStatus.final_sync_at`. | +| **ClangGithubCommit** | One row per **sha** (`unique`, 40-char hex). `github_committed_at` is the author/committer date used for commit fetch watermarks. | + +Raw JSON remains under `workspace/raw/github_activity_tracker///` (same layout as other raw GitHub activity). + +--- + ### 3. Boost Library Tracker #### Part 1: Boost Library, Headers, and Dependencies @@ -613,21 +634,35 @@ erDiagram erDiagram Direction LR WG21PaperAuthorProfile ||--o{ WG21PaperAuthor : "author" + WG21Mailing ||--o{ WG21Paper : "has" WG21PaperAuthor }o--|| WG21Paper : "has" WG21PaperAuthor { int id PK int paper_id FK int profile_id FK + int author_order + datetime created_at + } + + WG21Mailing { + int id PK + string mailing_date UK "IX" + string title datetime created_at + datetime updated_at } WG21Paper { int id PK - string paper_id UK "IX" + string paper_id "IX" + int year "IX" string url string title "IX" - date publication_date "IX" + date document_date "IX" + int mailing_id FK "IX" + string subgroup "IX" + boolean is_downloaded "IX" datetime created_at datetime updated_at } @@ -635,7 +670,11 @@ erDiagram **Note:** **WG21PaperAuthorProfile** extends `BaseProfile` (section 1). `profile_id` in WG21PaperAuthor references this profile; each paper can have multiple authors. -**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. +**Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing. + +**Note:** **WG21Paper** is uniquely identified by the composite `(paper_id, year)`; `paper_id` is not globally unique. The same paper identifier may appear in different years (e.g. revisions). + +**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. `author_order` is optional and 1-based; it indicates the order of authors on the paper. --- @@ -731,101 +770,206 @@ erDiagram --- +### 10. CPPA YouTube Script Tracker + +Stores YouTube video metadata, VTT transcripts, speaker links, and community tags for C++ conference talks (CppCon, C++Now, Meeting C++, etc.). + +- **`YouTubeChannel`** — publisher channel; `channel_id` is the primary key. +- **`YouTubeVideo`** — video metadata and transcript state; `video_id` is the primary key. +- **`YouTubeVideoSpeaker`** — M2M join between `YouTubeVideo` and `cppa_user_tracker.YoutubeSpeaker`. +- **`CppaTags`** — C++ community tag vocabulary (e.g. `concurrency`, `templates`, `modules`). +- **`YouTubeVideoTags`** — M2M join between `YouTubeVideo` and `CppaTags`. 
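+
+For orientation, a minimal ORM traversal over these join tables (the video ID value is
+illustrative):
+
+```python
+from cppa_youtube_script_tracker.models import YouTubeVideo
+
+video = YouTubeVideo.objects.prefetch_related(
+    "video_speakers__speaker", "video_tags__cppa_tag"
+).get(pk="VIDEOID1234")  # video_id is the primary key
+speakers = [vs.speaker.display_name for vs in video.video_speakers.all()]
+tags = [vt.cppa_tag.tag_name for vt in video.video_tags.all()]
+```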
+ +**Workspace layout:** + +``` +workspace/ +├── cppa_youtube_script_tracker/ +│ └── metadata/{video_id}.json # short-lived queue; moved to raw after DB persist +└── raw/ + └── cppa_youtube_script_tracker/ + ├── metadata/{video_id}.json # permanent archive + └── transcripts/{video_id}.en.vtt # permanent archive +``` + +```mermaid +erDiagram + direction TB + YoutubeSpeaker ||--o{ YouTubeVideoSpeaker : "appears_in" + YouTubeVideo ||--o{ YouTubeVideoSpeaker : "has" + YouTubeChannel ||--o{ YouTubeVideo : "hosts" + YouTubeVideo ||--o{ YouTubeVideoTags : "has" + CppaTags ||--o{ YouTubeVideoTags : "tagged_in" + + YouTubeChannel { + string channel_id PK + string channel_title + datetime created_at + datetime updated_at + } + + YouTubeVideo { + string video_id PK + string channel_id FK + string title + text description + datetime published_at "IX" + int duration_seconds + int view_count + int like_count + int comment_count + string search_term + bool has_transcript + string transcript_path + datetime scraped_at + datetime created_at + datetime updated_at + } + + YouTubeVideoSpeaker { + int id PK + string video_id FK + int speaker_id FK + datetime created_at + } + + CppaTags { + int id PK + string tag_name "UK IX" + } + + YouTubeVideoTags { + int id PK + string youtube_video_id FK + int cppa_tag_id FK + } + + YoutubeSpeaker { + int baseprofile_ptr_id PK + string display_name "IX" + } +``` + +**Note:** `YoutubeSpeaker` is defined in `cppa_user_tracker` (section 1) and extends `BaseProfile`. It is identified solely by `display_name` (same pattern as `MailingListProfile` and `WG21PaperAuthorProfile`). + +**Note:** `YouTubeVideoSpeaker` has a unique constraint on `(video, speaker)`. + +**Note:** `YouTubeVideoTags` has a unique constraint on `(youtube_video, cppa_tag)`. `CppaTags.tag_name` values are stored lowercase. + +--- + ## Appendix ### Appendix A: Table summary -| Table | Description | Section | -| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| **BaseProfile** | Base table for profiles; extended by platform-specific profile tables. Has `identity_id` FK to Identity. | 1 | -| **Identity** | Top-level user/account; one identity can have multiple BaseProfiles. | 1 | -| **Email** | Email addresses linked to BaseProfile (one profile, many emails). | 1 | -| **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. | 1 | -| **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | -| **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | -| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | -| **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | -| **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | -| **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | -| **GitHubFile** | File in a repo (filename, repo_id, is_deleted). Base for file subtypes. | 2 | -| **Language** | Reference: language name. | 2 | -| **License** | Reference: license name, spdx_id, url. | 2 | -| **RepoLanguage** | Repo-language link with line_count. | 2 | -| **RepoLicense** | Repo-license link. | 2 | -| **GitCommit** | Commit in a repo (hash, committer, comment, commit_at). 
| 2 | -| **GitCommitFileChange** | Per-commit file change (links commit, GitHubFile, status, additions, deletions, patch). | 2 | -| **Issue** | GitHub issue (repo, creator, number, title, body, state, labels, assignees). | 2 | -| **IssueComment** | Comment on an issue. | 2 | -| **IssueAssignee** | Issue-assignee link. | 2 | -| **IssueLabel** | Issue-label name. | 2 | -| **PullRequest** | PR (repo, creator, number, title, body, state, head_hash, base_hash, dates). | 2 | -| **PullRequestReview** | Review on a PR. | 2 | -| **PullRequestComment** | Comment on a PR. | 2 | -| **PullRequestAssignee** | PR-assignee link. | 2 | -| **PullRequestLabel** | PR-label name. | 2 | -| **BoostLibraryRepository** | Extends GitHubRepository; adds created_at, updated_at (Boost repos). | 3 | -| **BoostLibrary** | Library within a Boost repo (name). | 3 | -| **BoostFile** | Extends GitHubFile; adds library_id (file in a Boost library). | 3 | -| **BoostVersion** | Reference: Boost version string. | 3 | -| **BoostLibraryVersion** | Library-version link (cpp_version, description). | 3 | -| **BoostDependency** | Library dependency (client_library, version, dep_library). | 3 | -| **DependencyChangeLog** | Log of dependency add/remove (client_library, dep_library, is_add, created_at). | 3 | -| **BoostLibraryRoleRelationship** | Library version-account link (maintainer/author). | 3 | -| **BoostLibraryCategory** | Reference: category name. | 3 | -| **BoostLibraryCategoryRelationship** | Library-category link. | 3 | -| **BoostExternalRepository** | Extends GitHubRepository; adds boost_version, is_boost_embedded, is_boost_used. | 4 | -| **BoostUsage** | External repo use of Boost (repo, boost_header_id, file_path_id, last_commit_date). | 4 | -| **MailingListMessage** | Mailing list message (sender_id->MailingListProfile, msg_id, subject, content, list_name, sent_at). | 5 | -| **SlackTeam** | Slack workspace (team_id, team_name). | 6 | -| **SlackChannel** | Channel in a team (channel_id, name, type, creator_user_id). | 6 | -| **SlackMessage** | Message in a channel (ts, slack_user_id, message, thread_ts). | 6 | -| **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | -| **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | -| **WG21Paper** | WG21 paper (paper_id, url, title, publication_date). | 7 | -| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). | 7 | -| **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | -| **WebsiteVisitCount** | Per-date, per-country visit count. | 8 | -| **WebsiteWordCount** | Per-date, per-word count. | 8 | -| **PineconeFailList** | Failed sync records (failed_id, type) for retry/audit. | 9 | -| **PineconeSyncStatus** | Last sync per type (type, final_sync_at, created_at, updated_at); type = slack, mailing list, wg21, etc. | 9 | +| Table | Description | Section | +| ------------------------------------ | -------------------------------------------------------------------------------------------------------- | ------- | +| **BaseProfile** | Base table for profiles; extended by platform-specific profile tables. Has `identity_id` FK to Identity. | 1 | +| **Identity** | Top-level user/account; one identity can have multiple BaseProfiles. | 1 | +| **Email** | Email addresses linked to BaseProfile (one profile, many emails). | 1 | +| **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. 
| 1 | +| **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | +| **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | +| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | +| **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | +| **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | +| **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | +| **GitHubFile** | File in a repo (filename, repo_id, is_deleted). Base for file subtypes. | 2 | +| **Language** | Reference: language name. | 2 | +| **CreatedReposByLanguage** | Yearly repository counts by language (`all_repos`, `significant_repos`; unique on `language_id + year`). | 2 | +| **License** | Reference: license name, spdx_id, url. | 2 | +| **RepoLanguage** | Repo-language link with line_count. | 2 | +| **RepoLicense** | Repo-license link. | 2 | +| **GitCommit** | Commit in a repo (hash, committer, comment, commit_at). | 2 | +| **GitCommitFileChange** | Per-commit file change (links commit, GitHubFile, status, additions, deletions, patch). | 2 | +| **Issue** | GitHub issue (repo, creator, number, title, body, state, labels, assignees). | 2 | +| **IssueComment** | Comment on an issue. | 2 | +| **IssueAssignee** | Issue-assignee link. | 2 | +| **IssueLabel** | Issue-label name. | 2 | +| **PullRequest** | PR (repo, creator, number, title, body, state, head_hash, base_hash, dates). | 2 | +| **PullRequestReview** | Review on a PR. | 2 | +| **PullRequestComment** | Comment on a PR. | 2 | +| **PullRequestAssignee** | PR-assignee link. | 2 | +| **PullRequestLabel** | PR-label name. | 2 | +| **ClangGithubIssueItem** | Clang mirror: one row per issue/PR number (no FKs); GitHub timestamps + Django `updated_at` for Pinecone incrementality. | 2b | +| **ClangGithubCommit** | Clang mirror: one row per commit SHA (no FKs); `github_committed_at` for fetch watermark. | 2b | +| **BoostLibraryRepository** | Extends GitHubRepository; adds created_at, updated_at (Boost repos). | 3 | +| **BoostLibrary** | Library within a Boost repo (name). | 3 | +| **BoostFile** | Extends GitHubFile; adds library_id (file in a Boost library). | 3 | +| **BoostVersion** | Reference: Boost version string. | 3 | +| **BoostLibraryVersion** | Library-version link (cpp_version, description). | 3 | +| **BoostDependency** | Library dependency (client_library, version, dep_library). | 3 | +| **DependencyChangeLog** | Log of dependency add/remove (client_library, dep_library, is_add, created_at). | 3 | +| **BoostLibraryRoleRelationship** | Library version-account link (maintainer/author). | 3 | +| **BoostLibraryCategory** | Reference: category name. | 3 | +| **BoostLibraryCategoryRelationship** | Library-category link. | 3 | +| **BoostExternalRepository** | Extends GitHubRepository; adds boost_version, is_boost_embedded, is_boost_used. | 4 | +| **BoostUsage** | External repo use of Boost (repo, boost_header_id, file_path_id, last_commit_date). | 4 | +| **BoostMissingHeaderTmp** | Temporary usage records when header_name is not yet in BoostFile/GitHubFile (usage_id→BoostUsage.id). | 4 | +| **MailingListMessage** | Mailing list message (sender_id->MailingListProfile, msg_id, subject, content, list_name, sent_at). | 5 | +| **SlackTeam** | Slack workspace (team_id, team_name). | 6 | +| **SlackChannel** | Channel in a team (channel_id, name, type, creator_user_id). 
| 6 | +| **SlackMessage** | Message in a channel (ts, slack_user_id, message, thread_ts). | 6 | +| **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | +| **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | +| **WG21Paper** | WG21 paper (paper_id, url, title, publication_date). | 7 | +| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). | 7 | +| **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | +| **WebsiteVisitCount** | Per-date, per-country visit count. | 8 | +| **WebsiteWordCount** | Per-date, per-word count. | 8 | +| **PineconeFailList** | Failed sync records (failed_id, type) for retry/audit. | 9 | +| **PineconeSyncStatus** | Last sync per type (type, final_sync_at, created_at, updated_at); type = slack, mailing list, wg21, etc. | 9 | +| **YoutubeSpeaker** | Profile for YouTube speakers; extends BaseProfile. Identified by `display_name`. | 1, 10 | +| **YouTubeChannel** | Publisher channel; `channel_id` is PK (no auto-increment id). | 10 | +| **YouTubeVideo** | Video metadata, transcript state, and channel FK; `video_id` is PK (no auto-increment id). | 10 | +| **YouTubeVideoSpeaker** | M2M join between YouTubeVideo and YoutubeSpeaker (video_id, speaker_id). | 10 | +| **CppaTags** | C++ community tag vocabulary (tag_name, unique/lowercase). | 10 | +| **YouTubeVideoTags** | M2M join between YouTubeVideo and CppaTags (youtube_video_id, cppa_tag_id). | 10 | | **BoostDocContent** | Globally unique scraped page by content hash (url, content_hash UK, first_version_id, last_version_id, is_upserted, scraped_at). One row per unique content hash across all versions. | 10 | | **BoostLibraryDocumentation** | Join table: BoostLibraryVersion × BoostDocContent. Records which pages belong to each (library, version) pair. 
| 10 | ### Appendix B: Relationship summary -| From | To | Relationship | -| --------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | -| Identity | BaseProfile | One identity has many profiles | -| BaseProfile | Email | One profile has many emails | -| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile | Extends (1:1 subtype) | -| TmpIdentity | TempProfileIdentityRelation | Has many (target) | -| TempProfileIdentityRelation | BaseProfile | Has many (base_profile_id) | -| GitHubAccount | GitHubRepository | Owns many | -| GitHubRepository | RepoLanguage, RepoLicense | Has many | -| GitHubRepository | BoostLibraryRepository, BoostExternalRepository | Extends (1:1 subtype) | -| GitHubRepository | GitCommit, Issue, PullRequest | Contains many | -| GitHubRepository | GitHubFile | Has many | -| GitHubFile | BoostFile | Extends (1:1 subtype) | -| GitHubFile | GitCommitFileChange | Changed in (many file changes) | -| GitCommit | GitCommitFileChange | Has many | -| Issue | IssueComment, IssueAssignee, IssueLabel | Has many | -| PullRequest | PullRequestReview, PullRequestComment, PullRequestAssignee, PullRequestLabel | Has many | -| GitHubAccount | GitCommit, Issue, IssueComment, IssueAssignee, PullRequest, PullRequestReview, PullRequestComment, PullRequestAssignee | Committer/creator/author/assignee/reviewer | -| BoostLibraryRepository | BoostLibrary | Has many | -| BoostLibrary | BoostFile, BoostDependency (client/dep), BoostLibraryVersion, DependencyChangeLog | Has many | -| BoostLibrary | BoostLibraryCategoryRelationship | Has many | -| BoostVersion | BoostDependency, BoostLibraryVersion, BoostDocContent (first/last) | Version / first+last observed | -| BoostLibraryVersion | BoostLibraryRoleRelationship | Has many | -| GitHubAccount | BoostLibraryRoleRelationship | Role (maintainer/author) | -| BoostLibraryCategory | BoostLibraryCategoryRelationship | Category | -| BoostExternalRepository | BoostUsage | Has many | -| BoostUsage | BoostFile, GitHubFile | References (boost header, file path) | -| MailingListProfile | MailingListMessage | Sender (has many messages) | -| SlackTeam | SlackChannel | Has many | -| SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many | -| SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user | -| SlackChannel | SlackUser | Creator (many-to-one) | -| WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) | -| WG21Paper | WG21PaperAuthor | Has many authors | +| From | To | Relationship | +| --------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------- | +| Identity | BaseProfile | One identity has many profiles | +| BaseProfile | Email | One profile has many emails | +| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile | Extends (1:1 subtype) | +| TmpIdentity | TempProfileIdentityRelation | Has many (target) | +| TempProfileIdentityRelation | BaseProfile | Has many (base_profile_id) | +| GitHubAccount | GitHubRepository | Owns many | +| GitHubRepository | RepoLanguage, RepoLicense | Has many | +| Language | CreatedReposByLanguage | Has many yearly stats | +| GitHubRepository | BoostLibraryRepository, BoostExternalRepository | 
Extends (1:1 subtype) | +| GitHubRepository | GitCommit, Issue, PullRequest | Contains many | +| GitHubRepository | GitHubFile | Has many | +| GitHubFile | BoostFile | Extends (1:1 subtype) | +| GitHubFile | GitCommitFileChange | Changed in (many file changes) | +| GitCommit | GitCommitFileChange | Has many | +| Issue | IssueComment, IssueAssignee, IssueLabel | Has many | +| PullRequest | PullRequestReview, PullRequestComment, PullRequestAssignee, PullRequestLabel | Has many | +| GitHubAccount | GitCommit, Issue, IssueComment, IssueAssignee, PullRequest, PullRequestReview, PullRequestComment, PullRequestAssignee | Committer/creator/author/assignee/reviewer | +| BoostLibraryRepository | BoostLibrary | Has many | +| BoostLibrary | BoostFile, BoostDependency (client/dep), BoostLibraryVersion, DependencyChangeLog | Has many | +| BoostLibrary | BoostLibraryCategoryRelationship | Has many | +| BoostVersion | BoostDependency, BoostLibraryVersion | Version | +| BoostLibraryVersion | BoostLibraryRoleRelationship | Has many | +| GitHubAccount | BoostLibraryRoleRelationship | Role (maintainer/author) | +| BoostLibraryCategory | BoostLibraryCategoryRelationship | Category | +| BoostExternalRepository | BoostUsage | Has many | +| BoostUsage | BoostFile, GitHubFile | References (boost header, file path) | +| BoostUsage | BoostMissingHeaderTmp | Has many (temporary missing-header records) | +| MailingListProfile | MailingListMessage | Sender (has many messages) | +| SlackTeam | SlackChannel | Has many | +| SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many | +| SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user | +| SlackChannel | SlackUser | Creator (many-to-one) | +| WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) | +| WG21Paper | WG21PaperAuthor | Has many authors | +| YoutubeSpeaker | YouTubeVideoSpeaker | Appears in (many videos) | +| YouTubeChannel | YouTubeVideo | Hosts many videos | +| YouTubeVideo | YouTubeVideoSpeaker | Has many speakers | +| YouTubeVideo | YouTubeVideoTags | Has many tags | +| CppaTags | YouTubeVideoTags | Tagged in many videos | | BoostLibraryVersion | BoostLibraryDocumentation | Has many (boost_library_version_id) | | BoostDocContent | BoostLibraryDocumentation | Used in many (boost_doc_content_id) | diff --git a/docs/Workspace.md b/docs/Workspace.md index 76a030e4..21afe13d 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -20,8 +20,7 @@ workspace/ # WORKSPACE_DIR (configurable via │ │ └── prs/.json │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── /.json -├── clang_github_activity/ # State for clang_github_tracker (last sync dates) -│ └── state.json +├── clang_github_tracker/ # Markdown export for clang_github_tracker (md_export/) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── / │ └── messages/.json # Formatted cache (processed then removed) diff --git a/docs/operations/WG21_GitHub_Dispatch.md b/docs/operations/WG21_GitHub_Dispatch.md new file mode 100644 index 00000000..49046b01 --- /dev/null +++ b/docs/operations/WG21_GitHub_Dispatch.md @@ -0,0 +1,69 @@ +# WG21 Paper Tracker → GitHub Actions (`repository_dispatch`) + +The Django app **`run_wg21_paper_tracker`** scrapes WG21 mailings and stores paper metadata in the database. It does **not** download PDFs or other documents. 
When **new** paper rows are created in a run, it can send **one** [repository dispatch](https://docs.github.com/en/rest/repos/repos#create-a-repository-dispatch-event) to another GitHub repository so a workflow there fetches each URL and runs conversion (e.g. PDF → Markdown). + +## Environment variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `WG21_GITHUB_DISPATCH_ENABLED` | No (default `false`) | Set to `true` to send `repository_dispatch` when there are new papers. | +| `WG21_GITHUB_DISPATCH_REPO` | Yes, if enabled | Target repo as `owner/repo` (the repo whose workflow will run). | +| `WG21_GITHUB_DISPATCH_TOKEN` | Yes, if enabled | PAT or token with permission to create repository dispatch events on that repo (classic PAT: `repo` scope for private repos). | +| `WG21_GITHUB_DISPATCH_EVENT_TYPE` | No | Must match `on.repository_dispatch.types` in the target workflow. Default: `wg21_papers_convert`. | + +## `client_payload` contract + +The JSON body includes only a list of URL strings: + +```json +{ + "papers": [ + "https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/…", + "https://www.open-std.org/…" + ] +} +``` + +- **`papers`**: array of strings (WG21 document URLs), all new papers from **that** pipeline run in a **single** event. +- There is **no** `new_paper_count` field; use `length(papers)` in the workflow if needed. + +## Target repository workflow (example) + +```yaml +on: + repository_dispatch: + types: [wg21_papers_convert] + +jobs: + convert: + runs-on: ubuntu-latest + steps: + - name: URLs + run: | + echo '${{ toJson(github.event.client_payload.papers) }}' + # Fetch each URL, convert, store artifacts / upload elsewhere +``` + +In expressions, `github.event.client_payload.papers` is a JSON array of strings. + +## Token security + +Store `WG21_GITHUB_DISPATCH_TOKEN` in a secret manager or CI secret—never commit it. Prefer a fine-grained PAT scoped to the conversion repo if possible. + +## Payload size + +Very large mailings could produce many URLs in one payload. If you approach GitHub or runner limits, document a split strategy (multiple dispatches) as an edge case; the default is one dispatch per tracker run with the full list. + +## CLI options + +- **`--from-date YYYY-MM`**: Process mailings with `mailing_date >= YYYY-MM` (WG21 / CSV style). Backfills from that key onward when used alone. +- **`--to-date YYYY-MM`**: Upper bound: `mailing_date <= YYYY-MM`. With `--from-date`, the run uses the inclusive range `[from, to]`. Without `--from-date`, behavior stays incremental (only mailings **newer than** the latest `WG21Mailing` in the DB), but capped at `to`—useful to avoid pulling very new mailings in a controlled run. +- **`--dry-run`**: Log only; do not run the pipeline or send dispatch. + +## Flow summary + +1. Scheduler runs `run_wg21_paper_tracker` (optionally with `--from-date` / `--to-date`). +2. Pipeline fetches mailings, upserts `WG21Mailing` / `WG21Paper` (metadata only). +3. For each row **newly created** in that run, its document URL is collected. +4. If the list is non-empty and dispatch is enabled, the app POSTs once to `POST /repos/{owner}/{repo}/dispatches` with `event_type` and `client_payload: { "papers": [ ... ] }`. +5. The conversion repo’s workflow runs and downloads each URL. diff --git a/docs/service_api/README.md b/docs/service_api/README.md index d297f7c1..4a726830 100644 --- a/docs/service_api/README.md +++ b/docs/service_api/README.md @@ -13,6 +13,8 @@ Index of all app service modules. 
All writes to app models must go through the s | [cppa_pinecone_sync.services](cppa_pinecone_sync.md) | cppa_pinecone_sync | Pinecone fail list and sync status (failure tracking, last-sync bookkeeping). | | [boost_usage_tracker.services](boost_usage_tracker.md) | boost_usage_tracker | External repos, Boost usage, missing-header tmp. | | [discord_activity_tracker.services](discord_activity_tracker.md) | discord_activity_tracker | Servers, channels, messages, reactions (user profiles in cppa_user_tracker). | +| [cppa_youtube_script_tracker.services](cppa_youtube_script_tracker.md) | cppa_youtube_script_tracker | YouTube channels, videos, transcript state, and speaker links for C++ conference talks. | +| [clang_github_tracker.services](clang_github_tracker.md) | clang_github_tracker | Upsert llvm issue/PR/commit rows; DB watermarks for API fetch windows. | --- @@ -24,6 +26,8 @@ Index of all app service modules. All writes to app models must go through the s - **boost_library_docs_tracker** – Get-or-create BoostDocContent (by content_hash; holds url, first/last_version, is_upserted); link to BoostLibraryVersion via BoostLibraryDocumentation (join row only); Pinecone sync driven by BoostDocContent.is_upserted. - **boost_usage_tracker** – Get-or-create BoostExternalRepository, create/update BoostUsage, record missing headers (BoostMissingHeaderTmp). - **discord_activity_tracker** – Get-or-create DiscordServer, DiscordChannel; create/update DiscordMessage, DiscordReaction. Discord user profiles in cppa_user_tracker. +- **cppa_youtube_script_tracker** – Get-or-create YouTubeChannel, YouTubeVideo; update transcript state; link speakers to videos. Speaker profiles (`YoutubeSpeaker`) in cppa_user_tracker. - **cppa_pinecone_sync** – Get/clear/record failed IDs in PineconeFailList; get/update PineconeSyncStatus. +- **clang_github_tracker** – Upsert `ClangGithubIssueItem` / `ClangGithubCommit` during sync or backfill; read `Max(github_updated_at)` / `Max(github_committed_at)` for fetch cursors. See [Contributing.md](../Contributing.md) for the rule that all writes go through the service layer. diff --git a/docs/service_api/boost_library_docs_tracker.md b/docs/service_api/boost_library_docs_tracker.md index 6fadb7cb..7ad8f4b2 100644 --- a/docs/service_api/boost_library_docs_tracker.md +++ b/docs/service_api/boost_library_docs_tracker.md @@ -5,35 +5,35 @@ **Type notation:** `BoostDocContent` and `BoostLibraryDocumentation` are from `boost_library_docs_tracker.models`. `BoostLibraryVersion` is from `boost_library_tracker.models` (read-only cross-app reference). +**Pinecone upsert state** is stored on `BoostDocContent.is_upserted`, not on `BoostLibraryDocumentation` (the join table has only the two FKs plus `created_at`). + --- ## BoostDocContent -| Function | Parameter types | Return type | Notes | -|---|---|---|---| -| `get_or_create_doc_content` | `url: str`, `page_content: str`, `content_hash: str` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. | +| Function | Parameter types | Return type | Notes | +| -------------------------------- | ------------------------------------------------------------------- | ----------------------------- | --------------------------------------------------------------------- | +| `get_or_create_doc_content` | `url: str`, `content_hash: str`, `version_id: int \| None = None` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. 
| +| `set_doc_content_upserted` | `doc: BoostDocContent`, `value: bool` | `BoostDocContent` | Sets `is_upserted`. | +| `set_doc_content_upserted_by_ids`| `ids: list[int]`, `value: bool` | `int` | Bulk `UPDATE`; returns number of rows updated. | +| `get_unupserted_doc_contents` | — | `QuerySet[BoostDocContent]` | `is_upserted=False`; used for Pinecone sync worklists. | ### `get_or_create_doc_content` return values The second element is a `str` indicating what changed: -| `change_type` | Condition | Side effects | -|---|---|---| -| `"created"` | URL not in DB | Inserts row with `page_content`, `content_hash`, `scraped_at=now()`. | -| `"content_changed"` | URL exists; `content_hash` differs | Updates `page_content`, `content_hash`, `scraped_at=now()`. | -| `"unchanged"` | URL exists; `content_hash` same | Updates `scraped_at=now()` only. | +| `change_type` | Condition | Side effects | +| ------------- | ----------------------------- | ------------------------------------------------------------------------------- | +| `"created"` | `content_hash` not in DB | Inserts row with `url`, `content_hash`, `scraped_at=now()`, `is_upserted=False`. May set `first_version_id` / `last_version_id` when `version_id` is passed. | +| `"unchanged"` | `content_hash` already exists | Updates `scraped_at`, and may update `url` and version FKs; same hash identity. | --- ## BoostLibraryDocumentation -| Function | Parameter types | Return type | Notes | -|---|---|---|---| -| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int`, `page_count: int` | `tuple[BoostLibraryDocumentation, bool]` | Get or create a row for the (library_version, doc_content) pair. Sets `page_count`. If exists, updates `page_count` if changed. | -| `mark_relation_running` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="running"`, `updated_at=now()`. | -| `mark_relation_completed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="completed"`, `updated_at=now()`. | -| `mark_relation_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | -| `get_pending_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | Returns all rows for this library-version where `status != "completed"`. Empty queryset means the library-version is fully done (skip on restart). | -| `get_docs_pending_sync` | — | `QuerySet[BoostLibraryDocumentation]` | Returns all rows where `status in ("pending", "failed")`. Used by the Pinecone sync step. | -| `mark_doc_synced` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="synced"` (or equivalent completed sync state), `updated_at=now()`. | -| `mark_doc_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | +Join table: one row per `(boost_library_version, boost_doc_content)` pair. **No** `page_count`, status fields, or `updated_at` on the model. + +| Function | Parameter types | Return type | Notes | +| --------------------------------- | ---------------------------------------------------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int` | `tuple[BoostLibraryDocumentation, bool]` | `get_or_create` on the pair. Second value is `created`. 
| +| `get_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | All join rows for that library version. | diff --git a/docs/service_api/clang_github_tracker.md b/docs/service_api/clang_github_tracker.md new file mode 100644 index 00000000..3e53ff51 --- /dev/null +++ b/docs/service_api/clang_github_tracker.md @@ -0,0 +1,35 @@ +# clang_github_tracker.services + +**Module path:** `clang_github_tracker.services` +**Description:** Upserts for `ClangGithubIssueItem` and `ClangGithubCommit` (no FKs to other apps). Used by `sync_clang_github_activity`, `backfill_clang_github_tracker`, and date resolution watermarks. + +**Type notation:** Models live in `clang_github_tracker.models`. + +--- + +## Upserts + +| Function | Parameters | Return | Raises | +| -------- | ---------- | ------ | ------ | +| `upsert_issue_item` | `number: int`, `*, is_pull_request: bool`, `github_created_at`, `github_updated_at` | `tuple[ClangGithubIssueItem, bool]` (instance, created) | — | +| `upsert_commit` | `sha: str`, `*, github_committed_at` | `tuple[ClangGithubCommit, bool]` | `ValueError` if `sha` is not 40 hex chars | + +--- + +## API fetch watermarks + +| Function | Return | Notes | +| -------- | ------ | ----- | +| `get_issue_item_watermark` | `datetime \| None` | `Max(github_updated_at)` over all issue/PR rows (unified issues+PR stream). | +| `get_commit_watermark` | `datetime \| None` | `Max(github_committed_at)` over commits. | +| `start_after_watermark` | `datetime \| None` | `max_dt + timedelta(milliseconds=1)` or `None` if `max_dt` is `None`. | + +Used by `clang_github_tracker.state_manager.resolve_start_end_dates` (with optional CLI `--since` / `--until` bounds). + +--- + +## Related docs + +- [Schema.md](../Schema.md) – Section 2b: Clang GitHub Tracker. +- [Workspace.md](../Workspace.md) – `workspace/raw/github_activity_tracker/`, `workspace/clang_github_tracker/`. +- [Contributing.md](../Contributing.md) – Service layer rule. diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index f638501b..8f506423 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -41,6 +41,14 @@ --- +## WG21PaperAuthorProfile + +| Function | Parameter types | Return type | Description | +| -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist and one matches the email, returns that profile. If multiple exist and no email is provided, returns the first. If multiple exist and the supplied email matches none, creates a new profile with that email. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. 
Use when linking paper authors so that same name + same email link to the same profile. | + +--- + ## DiscordProfile | Function | Parameter types | Return type | Description | diff --git a/docs/service_api/cppa_youtube_script_tracker.md b/docs/service_api/cppa_youtube_script_tracker.md new file mode 100644 index 00000000..a429912a --- /dev/null +++ b/docs/service_api/cppa_youtube_script_tracker.md @@ -0,0 +1,145 @@ +# cppa_youtube_script_tracker — Service API + +**Module path:** `cppa_youtube_script_tracker.services` +**Description:** YouTube channel metadata, video metadata, transcript state, and speaker links for C++ conference talks. Single place for all writes to `cppa_youtube_script_tracker` models. Speaker profiles live in `cppa_user_tracker.YoutubeSpeaker`. + +**Type notation:** Model types refer to `cppa_youtube_script_tracker.models` unless noted. `YoutubeSpeaker` refers to `cppa_user_tracker.models.YoutubeSpeaker`. + +--- + +## YouTubeChannel + +| Function | Parameter types | Return type | Description | +| ----------------------- | ------------------------------------------------ | ---------------- | ------------------------------------------------------------------------------- | +| `get_or_create_channel` | `channel_id: str`, `channel_title: str = ""` | `YouTubeChannel` | Get or create channel by `channel_id`; updates `channel_title` if it has changed. | + +--- + +## YouTubeVideo + +| Function | Parameter types | Return type | Description | +| ---------------------- | ---------------------------------------------------------------------------------- | ------------------------ | ----------------------------------------------------------------------------------------------- | +| `get_or_create_video` | `video_id: str`, `channel: YouTubeChannel \| None`, `metadata_dict: dict` | `tuple[YouTubeVideo, bool]` | Get or create video by `video_id`. Raises `ValueError` if `video_id` is empty. | +| `update_video_transcript` | `video: YouTubeVideo`, `transcript_path: str` | `YouTubeVideo` | Set `has_transcript=True` and `transcript_path` on the video; saves `update_fields`. | + +`metadata_dict` accepted keys: + +| Key | Type | Notes | +| ------------------ | ----------------- | -------------------------------------------------- | +| `title` | str | | +| `description` | str | | +| `published_at` | datetime or str | ISO string is parsed via `parse_datetime` | +| `duration_seconds` | int | | +| `view_count` | int \| None | | +| `like_count` | int \| None | | +| `comment_count` | int \| None | | +| `search_term` | str | Search term used to discover the video | +| `scraped_at` | datetime or str | ISO string is parsed via `parse_datetime` | + +Tags are not part of `metadata_dict`; use `get_or_create_tag` and `link_tag_to_video` (in this module) to associate tags with a video after creating or fetching it. + +--- + +## YouTubeVideoSpeaker + +| Function | Parameter types | Return type | Description | +| --------------------- | --------------------------------------------- | -------------------- | -------------------------------------------------------- | +| `link_speaker_to_video` | `video: YouTubeVideo`, `speaker: YoutubeSpeaker` | `YouTubeVideoSpeaker` | Get-or-create M2M link between a video and a speaker. 
| + +--- + +## YoutubeSpeaker (in cppa_user_tracker) + +| Function | Parameter types | Return type | Description | +| ------------------------------- | -------------------------------------------------- | ---------------------------- | -------------------------------------------------------------------------------- | +| `get_or_create_youtube_speaker` | `external_id: str`, `display_name: str = ""`, `identity: Identity \| None = None` | `tuple[YoutubeSpeaker, bool]` | Get or create a speaker by `external_id`; updates `display_name` when provided. Raises `ValueError` if `external_id` is empty. | + +**Module path:** `cppa_user_tracker.services` + +--- + +## Preprocessor + +**Module path:** `cppa_youtube_script_tracker.preprocessor` + +| Function | Parameter types | Return type | Description | +| -------------------------------- | ------------------------------------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------- | +| `preprocess_youtube_for_pinecone` | `failed_ids: list[str]`, `final_sync_at: datetime \| None` | `tuple[list[dict], bool]` | Build Pinecone sync documents for YouTube videos. Returns `(docs, is_chunked=False)`. | + +Each document dict has: +- `content` — Title, speakers, channel, published date, description, and transcript text (if available). +- `metadata["doc_id"]` — `"youtube-{video_id}"`. +- `metadata["ids"]` — DB primary key of the `YouTubeVideo` row (for retry tracking). +- `metadata["type"]` — `"youtube"`. +- `metadata["url"]` — `"https://www.youtube.com/watch?v={video_id}"`. +- `metadata["title"]`, `metadata["author"]` (comma-separated speaker names), `metadata["channel"]`, `metadata["timestamp"]` (Unix timestamp), `metadata["has_transcript"]`. + +--- + +## Workspace helpers + +**Module path:** `cppa_youtube_script_tracker.workspace` + +| Function | Return type | Description | +| ----------------------- | ----------- | --------------------------------------------------------------------------- | +| `get_workspace_root()` | `Path` | `workspace/cppa_youtube_script_tracker/` | +| `get_raw_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/` (permanent JSON archive) | +| `get_raw_transcripts_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/transcripts/` (permanent VTT archive) | +| `get_metadata_queue_dir()` | `Path` | `workspace/cppa_youtube_script_tracker/metadata/` (short-lived; moved after persist) | +| `get_raw_metadata_path(video_id)` | `Path` | Raw metadata JSON archive path for a video. | +| `get_metadata_queue_path(video_id)` | `Path` | Metadata queue JSON path for a video. | +| `get_transcript_path(video_id, lang="en")` | `Path` | VTT path for a video. | +| `iter_metadata_queue_jsons()` | `Iterator[Path]` | Yield all `*.json` files in the metadata queue directory. 
| + +--- + +## Fetcher + +**Module path:** `cppa_youtube_script_tracker.fetcher` + +| Function | Parameter types | Return type | Description | +| --------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------- | +| `fetch_videos` | `published_after: datetime`, `published_before: datetime`, `channel_title: str \| None = None`, `skip_video_ids: set[str] \| None = None`, `min_duration_seconds: int = 0` | `list[dict]` | Fetch video metadata from YouTube Data API v3 for the given time window. Returns normalised metadata dicts. | + +Each returned dict contains the following keys: + +| Key | Type | Notes | +| ------------------ | ----------- | ----------------------------------------------------------------- | +| `video_id` | str | YouTube video ID | +| `title` | str | | +| `description` | str | | +| `channel_id` | str | | +| `channel_title` | str | | +| `published_at` | str | ISO 8601 datetime string from API | +| `duration_seconds` | int | Parsed from ISO 8601 duration (e.g. `PT1H2M10S`) | +| `view_count` | int \| None | | +| `like_count` | int \| None | | +| `comment_count` | int \| None | | +| `tags` | list | | +| `search_term` | str | Query used to discover the video | +| `scraped_at` | str | ISO 8601 datetime when the API call was made | + +**`channel_title` behaviour:** If `channel_title` matches a key in the `C_PLUS_PLUS_CHANNELS` dict, the API call is filtered by that channel's ID. If `channel_title` is unrecognised, a keyword search by name is used. If `channel_title` is `None`, all known C++ channels are searched. + +**Requires:** `YOUTUBE_API_KEY` setting. Raises `ValueError` if missing. Raises `ImportError` if `google-api-python-client` is not installed. + +--- + +## Transcript downloader + +**Module path:** `cppa_youtube_script_tracker.transcript` + +| Function | Parameter types | Return type | Description | +| --------------- | ----------------------------------------------------------------------------- | --------------- | ---------------------------------------------------------------------------------------------------------------- | +| `download_vtt` | `video_id: str`, `output_dir: Path`, `cookies_file: str \| None = None` | `Path \| None` | Download English VTT subtitles for `video_id` into `output_dir`. Returns path to the `.vtt` file, or `None` if not found. | + +Tries manual captions first, then auto-generated (`writeautomaticsub`). The output file is written as `{video_id}.en.vtt`; falls back to any `{video_id}*.vtt` file in `output_dir` if the expected name is not present. + +**Requires:** `yt-dlp`. Raises `ImportError` if not installed. + +--- + +## Related docs + +- [Schema.md](../Schema.md) – Section 10: CPPA YouTube Script Tracker. +- [service_api/README.md](README.md) – Service API index. 
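+
+---
+
+## End-to-end example (sketch)
+
+A minimal sketch of how the pieces above compose: fetch metadata, persist channel and
+video rows, then attach a transcript. This is illustrative only, not the actual management
+command; it assumes Django is configured, `YOUTUBE_API_KEY` is set, the submodules are
+importable as shown, and that `"CppCon"` is a key in `C_PLUS_PLUS_CHANNELS`.
+
+```python
+from datetime import datetime, timedelta, timezone
+
+from cppa_youtube_script_tracker import fetcher, services, transcript, workspace
+
+# Documented metadata_dict keys; tags are linked separately via the tag helpers.
+METADATA_KEYS = (
+    "title", "description", "published_at", "duration_seconds",
+    "view_count", "like_count", "comment_count", "search_term", "scraped_at",
+)
+
+
+def sync_recent_cppcon_talks() -> None:
+    now = datetime.now(timezone.utc)
+    for meta in fetcher.fetch_videos(
+        published_after=now - timedelta(days=30),
+        published_before=now,
+        channel_title="CppCon",        # assumed channel key
+        min_duration_seconds=600,      # skip short clips
+    ):
+        channel = services.get_or_create_channel(meta["channel_id"], meta["channel_title"])
+        # Pass only the documented metadata_dict keys from the fetcher output.
+        metadata = {k: meta[k] for k in METADATA_KEYS if k in meta}
+        video, _created = services.get_or_create_video(meta["video_id"], channel, metadata)
+        vtt_path = transcript.download_vtt(meta["video_id"], workspace.get_raw_transcripts_dir())
+        if vtt_path is not None:
+            services.update_video_transcript(video, str(vtt_path))
+```
+
+The production flow presumably also writes the raw archive and metadata-queue JSON files
+listed under Workspace helpers; this sketch skips that step.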
diff --git a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index 5621c768..23ffdec6 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -9,6 +9,7 @@ import time from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Iterator, Optional +from urllib.parse import parse_qs, urlparse import requests @@ -18,6 +19,24 @@ logger = logging.getLogger(__name__) +def _make_aware(dt: datetime) -> datetime: + """Return dt as UTC-aware; if naive, assume UTC.""" + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + +def _in_date_range( + dt: datetime, + start_time: Optional[datetime], + end_time: Optional[datetime], +) -> bool: + """Return True if dt falls within [start_time, end_time] (UTC-aware, both inclusive).""" + if start_time and dt < _make_aware(start_time): + return False + if end_time and dt > _make_aware(end_time): + return False + return True + + def fetch_user_from_github( client: GitHubAPIClient, username: str = "", @@ -42,6 +61,54 @@ def fetch_user_from_github( return None +def _is_first_page_url(url: str) -> bool: + """Return True if the URL's page= query param is 1 or absent (GitHub default).""" + try: + pages = parse_qs(urlparse(url).query).get("page") + return int(pages[0]) == 1 if pages else True + except (ValueError, IndexError): + return False + + +def _yield_commit_with_stats( + client: GitHubAPIClient, + owner: str, + repo: str, + commit: dict, + start_time: Optional[datetime], + end_time: Optional[datetime], +) -> Iterator[dict]: + """Filter a single commit list entry by date range, fetch full stats, and yield.""" + commit_date_str = commit.get("commit", {}).get("author", {}).get( + "date" + ) or commit.get("commit", {}).get("committer", {}).get("date") + if commit_date_str: + try: + commit_dt = datetime.fromisoformat(commit_date_str.replace("Z", "+00:00")) + if not _in_date_range(commit_dt, start_time, end_time): + return + except Exception as e: + logger.debug("Failed to parse commit date '%s': %s", commit_date_str, e) + + try: + commit_with_stats = client.rest_request( + f"/repos/{owner}/{repo}/commits/{commit['sha']}" + ) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code in (502, 503, 504): + logger.warning( + "Aborting commit sync at %s for %s/%s after HTTP %s: %s", + commit["sha"][:7], + owner, + repo, + e.response.status_code, + e, + ) + raise + raise + yield commit_with_stats + + def fetch_commits_from_github( client: GitHubAPIClient, owner: str, @@ -50,110 +117,127 @@ def fetch_commits_from_github( end_time: Optional[datetime] = None, etag_cache: Optional[Any] = None, ) -> Iterator[dict]: - """Fetch commits from GitHub API (paginated). Yields commit dicts with stats. - If etag_cache is provided, uses rest_request_conditional for the list GET. + """Fetch commits from GitHub API oldest-to-newest using Link header pagination. + + When GitHub includes rel="last", walks backward (last → prev → … → page 1) so + commits are yielded oldest-first. When rel="last" is omitted but rel="next" + is present (e.g. some since/until responses), follows "next" to fetch all + pages, then yields oldest-first. True single-page responses have neither link. + + The page-1 list response is cached in memory so when backward traversal returns to + page 1 via the "prev" link, no duplicate request is made. 
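+
+    For example, if page 1's Link header reports rel="last" = page 3, the fetch order is
+    page 3 → page 2 → page 1 (page 1 served from the cached first response), with each
+    page's commit list iterated in reverse, so the overall stream is oldest-first.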
+ + If etag_cache is provided, a conditional GET is used for page 1; a 304 means no + new commits exist in the requested date window and the function returns immediately. """ - logger.debug(f"Fetching commits for {owner}/{repo} from {start_time} to {end_time}") - page = 1 + logger.debug( + "Fetching commits for %s/%s from %s to %s", owner, repo, start_time, end_time + ) + per_page = 100 since_iso = start_time.isoformat() if start_time else "" until_iso = end_time.isoformat() if end_time else "" - - while True: - params = { - "per_page": per_page, - "page": page, - } - if start_time: - params["since"] = start_time.isoformat() - if end_time: - params["until"] = end_time.isoformat() - - response_etag = None - if etag_cache is not None: - etag = etag_cache.get("commits", page, since_iso, until_iso) - data, response_etag = client.rest_request_conditional( - f"/repos/{owner}/{repo}/commits", params=params, etag=etag + endpoint = f"/repos/{owner}/{repo}/commits" + + params: dict = {"per_page": per_page, "page": 1} + if start_time: + params["since"] = start_time.isoformat() + if end_time: + params["until"] = end_time.isoformat() + + # Fetch page 1 to discover total pages via Link header. + first_page_etag: Optional[str] = None + if etag_cache is not None: + etag = etag_cache.get("commits", 1, since_iso, until_iso) + first_page_data, first_page_etag, first_page_links = ( + client.rest_request_conditional_with_all_links( + endpoint, params=params, etag=etag ) - if data is None: - logger.debug("Commits list page %s: 304 Not Modified, skipping", page) - page += 1 - time.sleep(0.2) - continue - commits = data - else: - commits = client.rest_request(f"/repos/{owner}/{repo}/commits", params) + ) + if first_page_data is None: + logger.debug("Commits list page 1: 304 Not Modified, nothing to process") + return + else: + first_page_data, first_page_links = client.rest_request_with_all_links( + endpoint, params + ) - if not commits: - logger.debug(f"No more commits found at page {page}") - break - logger.debug(f"Fetched {len(commits)} commits from page {page}") + if not first_page_data: + logger.debug("No commits found for %s/%s", owner, repo) + return - for commit in reversed(commits): - commit_date_str = commit.get("commit", {}).get("author", {}).get( - "date" - ) or commit.get("commit", {}).get("committer", {}).get("date") - if commit_date_str: - try: - commit_dt = datetime.fromisoformat( - commit_date_str.replace("Z", "+00:00") - ) + logger.debug( + "Fetched %d commits on page 1 for %s/%s", len(first_page_data), owner, repo + ) - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if commit_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if commit_dt > end_time_aware: - continue - except Exception as e: - logger.debug( - f"Failed to parse commit date '{commit_date_str}': {e}" - ) + last_url = first_page_links.get("last") + next_url = first_page_links.get("next") + + if last_url and not _is_first_page_url(last_url): + # Multiple pages: walk backward from last page to page 1, yielding oldest-first. + current_url: Optional[str] = last_url + while current_url is not None: + if _is_first_page_url(current_url): + # Reuse the already-fetched page-1 data — no extra API request. 
+ page_data = first_page_data + page_links = first_page_links + logger.debug("Backward traversal reached page 1; using cached data") + else: + page_data, page_links = client.rest_request_url_with_all_links( + current_url + ) + logger.debug( + "Fetched %d commits (backward traversal) from %s", + len(page_data) if page_data else 0, + current_url, + ) + time.sleep(0.2) - # Fetch full commit with stats (abort on 502/503/504 so page is not checkpointed and can be retried) - try: - commit_with_stats = client.rest_request( - f"/repos/{owner}/{repo}/commits/{commit['sha']}" + for commit in reversed(page_data or []): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time ) - except requests.exceptions.HTTPError as e: - if e.response is not None and e.response.status_code in ( - 502, - 503, - 504, - ): - logger.warning( - "Aborting commit sync at %s for %s/%s after HTTP %s: %s", - commit["sha"][:7], - owner, - repo, - e.response.status_code, - e, - ) - raise - raise - yield commit_with_stats - if etag_cache is not None and response_etag: - etag_cache.set("commits", page, since_iso, until_iso, response_etag) + current_url = page_links.get("prev") + + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) + return - if len(commits) < per_page: + if next_url: + # rel="last" omitted but rel="next" is present: fetch remaining pages, oldest-first. + pages: list[list[dict]] = [first_page_data] + current_links = first_page_links + while current_links.get("next"): + forward_url = current_links["next"] + page_data, current_links = client.rest_request_url_with_all_links( + forward_url + ) logger.debug( - f"Last page reached (got {len(commits)} commits, expected {per_page})" + "Fetched %d commits (forward pagination) from %s", + len(page_data) if page_data else 0, + forward_url, ) - break - page += 1 - time.sleep(0.2) + time.sleep(0.2) + pages.append(page_data or []) + + for page_data in reversed(pages): + for commit in reversed(page_data): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) + return + + # No pagination: neither next nor a multi-page last link. 
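+    # (Also reached when rel="last" points back at page 1, i.e. effectively a single page.)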
+ logger.debug("Single page of commits; processing in reverse order") + for commit in reversed(first_page_data): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) def fetch_comments_from_github( @@ -200,26 +284,11 @@ def fetch_comments_from_github( if created_str: try: c_dt = datetime.fromisoformat(created_str.replace("Z", "+00:00")) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if c_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if c_dt > end_time_aware: - continue + if not _in_date_range(c_dt, start_time, end_time): + continue except Exception as e: logger.debug(f"Failed to parse comment date '{created_str}': {e}") + continue results.append(comment) @@ -232,139 +301,6 @@ def fetch_comments_from_github( return results -def fetch_issues_from_github( - client: GitHubAPIClient, - owner: str, - repo: str, - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - etag_cache: Optional[Any] = None, -) -> Iterator[dict]: - """Fetch issues from GitHub API (paginated). Yields issue dicts with comments. - Uses GitHub's Link header (rel=\"next\") for pagination per API docs. - If etag_cache is provided, uses conditional GET for the first page when using endpoint+params. - """ - logger.debug(f"Fetching issues for {owner}/{repo} from {start_time} to {end_time}") - per_page = 100 - since_iso = start_time.isoformat() if start_time else "" - endpoint = f"/repos/{owner}/{repo}/issues" - next_url: Optional[str] = None - page_num = 1 - - while True: - # Fresh each page: rest_request_url does not return an ETag; do not reuse - # page N-1's tag when caching page N (conditional path sets this below). 
- response_etag: Optional[str] = None - try: - if next_url is not None: - issues, next_url = client.rest_request_url(next_url) - page_num += 1 - else: - params = { - "state": "all", - "per_page": per_page, - "page": page_num, - "sort": "updated", - "direction": "asc", - } - if start_time: - params["since"] = start_time.isoformat() - if etag_cache is not None: - etag = etag_cache.get("issues", page_num, since_iso, "") - data, response_etag, next_url = ( - client.rest_request_conditional_with_link( - endpoint, params=params, etag=etag - ) - ) - if data is None: - logger.debug( - "Issues list page %s: 304 Not Modified, skipping", - page_num, - ) - page_num += 1 - time.sleep(0.2) - continue - issues = data - else: - issues, next_url = client.rest_request_with_link(endpoint, params) - except requests.exceptions.HTTPError as e: - if e.response is not None and e.response.status_code == 422: - logger.debug( - "Issues list: 422 Unprocessable Entity, stopping pagination" - ) - break - raise - - if not issues: - logger.debug("No more issues found") - break - - # Filter out PRs (issues endpoint returns both issues and PRs) - raw_issues = issues - issues = [i for i in raw_issues if "pull_request" not in i] - logger.debug( - "Fetched %s issues (excluding PRs) from page %s", - len(issues), - page_num, - ) - - for issue in issues: - updated_str = issue.get("updated_at") or issue.get("created_at") - if not updated_str: - continue - try: - issue_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) - except (ValueError, TypeError) as e: - logger.debug(f"Failed to parse issue date '{updated_str}': {e}") - continue - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if issue_dt < start_time_aware: - continue - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if issue_dt > end_time_aware: - continue - - issue_number = issue.get("number") - if issue_number is not None: - # Fetch full issue detail (list endpoint returns summary only) - try: - full_issue = client.rest_request( - f"/repos/{owner}/{repo}/issues/{issue_number}" - ) - if full_issue and isinstance(full_issue, dict): - issue = full_issue - except Exception as e: - logger.debug("Failed to fetch full issue #%s: %s", issue_number, e) - logger.debug(f"Fetching comments for issue #{issue_number}") - comments = fetch_comments_from_github( - client, owner, repo, issue_number, start_time, end_time - ) - logger.debug( - f"Found {len(comments)} comments for issue #{issue_number}" - ) - # Yield nested format: { issue_info: , comments: [...] 
} - yield {"issue_info": issue, "comments": comments} - - if etag_cache is not None and response_etag: - etag_cache.set("issues", page_num, since_iso, "", response_etag) - - if next_url is None: - logger.debug('Last page reached (no Link rel="next")') - break - time.sleep(0.2) - - def fetch_pr_reviews_from_github( client: GitHubAPIClient, owner: str, @@ -402,26 +338,11 @@ def fetch_pr_reviews_from_github( review_dt = datetime.fromisoformat( updated_str.replace("Z", "+00:00") ) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if review_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if review_dt > end_time_aware: - continue + if not _in_date_range(review_dt, start_time, end_time): + continue except Exception as e: logger.debug(f"Failed to parse review date '{updated_str}': {e}") + continue results.append(review) @@ -439,7 +360,7 @@ def fetch_pr_reviews_from_github( return results -def fetch_pull_requests_from_github( +def fetch_issues_and_prs_from_github( client: GitHubAPIClient, owner: str, repo: str, @@ -447,102 +368,173 @@ def fetch_pull_requests_from_github( end_time: Optional[datetime] = None, etag_cache: Optional[Any] = None, ) -> Iterator[dict]: - """Fetch pull requests from GitHub API (paginated). Yields PR dicts with comments and reviews. - If etag_cache is provided, uses rest_request_conditional for the list GET. + """Fetch issues and PRs from GitHub using a single /issues list endpoint. + + GitHub's issues API returns both issues and pull requests; this function routes each + item by the presence of the "pull_request" key: + - Issues → yield {"issue_info": , "comments": [...]} + - PRs → yield {"pr_info": , "comments": [...], "reviews": [...]} + + Uses Link-header pagination (direction=asc, sort=updated) so items are processed + oldest-updated-first. + + When etag_cache is provided, list requests built from query params use conditional + GET (If-None-Match); ETags are keyed by list type, page, and since_iso in the cache. + A 304 response has no JSON for that page; pagination may continue by advancing + ``page`` while still on the params path, or by following ``Link`` after a 200. + + Requests made via full ``next`` URLs (``rest_request_url``) do not use the ETag cache. 
""" - logger.debug(f"Fetching PRs for {owner}/{repo} from {start_time} to {end_time}") - page = 1 + logger.debug( + "Fetching issues+PRs for %s/%s from %s to %s", owner, repo, start_time, end_time + ) per_page = 100 + since_iso = start_time.isoformat() if start_time else "" + endpoint = f"/repos/{owner}/{repo}/issues" + next_url: Optional[str] = None + page_num = 1 - while True: - params = { + def _issues_list_params(page: int) -> dict: + params: dict = { "state": "all", "per_page": per_page, "page": page, "sort": "updated", - "direction": "desc", + "direction": "asc", } - response_etag = None - if etag_cache is not None: - etag = etag_cache.get("pulls", page, "", "") - data, response_etag = client.rest_request_conditional( - f"/repos/{owner}/{repo}/pulls", params=params, etag=etag - ) - if data is None: - logger.debug("Pulls list page %s: 304 Not Modified, skipping", page) - page += 1 - time.sleep(0.2) + if start_time: + params["since"] = start_time.isoformat() + return params + + def _yield_issue_pr_items_for_list_page(items: list) -> Iterator[dict]: + for item in items: + updated_str = item.get("updated_at") or item.get("created_at") + if not updated_str: + continue + try: + item_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) + except (ValueError, TypeError) as e: + logger.debug("Failed to parse item date '%s': %s", updated_str, e) continue - prs = data - else: - prs = client.rest_request(f"/repos/{owner}/{repo}/pulls", params) - if not prs: - logger.debug(f"No more PRs found at page {page}") - break + if not _in_date_range(item_dt, start_time, end_time): + continue - flag = False - for pr in prs: - updated_str = pr.get("updated_at") or pr.get("created_at") - pr_number = pr.get("number") - logger.debug("Fetching PR #%s with updated_str: %s", pr_number, updated_str) - if updated_str: + number = item.get("number") + if number is None: + continue + + if "pull_request" in item: + # PR: fetch full detail from /pulls endpoint, then comments + reviews. + try: + full_pr = client.rest_request( + f"/repos/{owner}/{repo}/pulls/{number}" + ) + if full_pr and isinstance(full_pr, dict): + item = full_pr + except Exception as e: + logger.debug("Failed to fetch full PR #%s: %s", number, e) + logger.debug("Fetching comments for PR #%s", number) + comments = fetch_comments_from_github( + client, owner, repo, number, start_time, end_time + ) + time.sleep(0.2) + logger.debug("Fetching reviews for PR #%s", number) + reviews = fetch_pr_reviews_from_github( + client, owner, repo, number, start_time, end_time + ) + time.sleep(0.2) + yield {"pr_info": item, "comments": comments, "reviews": reviews} + else: + # Issue: fetch full detail from /issues endpoint, then comments. 
try: - pr_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if pr_dt < start_time_aware: - flag = True - break - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if pr_dt > end_time_aware: - continue + full_issue = client.rest_request( + f"/repos/{owner}/{repo}/issues/{number}" + ) + if full_issue and isinstance(full_issue, dict): + item = full_issue except Exception as e: - logger.debug("Failed to parse PR date '%s': %s", updated_str, e) + logger.debug("Failed to fetch full issue #%s: %s", number, e) + logger.debug("Fetching comments for issue #%s", number) + comments = fetch_comments_from_github( + client, owner, repo, number, start_time, end_time + ) + logger.debug("Found %d comments for issue #%s", len(comments), number) + yield {"issue_info": item, "comments": comments} + + # Phase 1: params-based list requests (optional conditional GET + ETag cache). + while next_url is None: + response_etag: Optional[str] = None + try: + params = _issues_list_params(page_num) + if etag_cache is not None: + etag = etag_cache.get("issues_and_prs", page_num, since_iso, "") + data, response_etag, next_url = ( + client.rest_request_conditional_with_link( + endpoint, params=params, etag=etag + ) + ) + if data is None: + logger.debug( + "Issues+PRs list page %s: 304 Not Modified, skipping", + page_num, + ) + page_num += 1 + time.sleep(0.2) continue + items = data + else: + items, next_url = client.rest_request_with_link(endpoint, params) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code == 422: + logger.debug( + "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" + ) + return + raise - if pr_number is None: - continue + if not items: + logger.debug("No more issues/PRs found") + break - # Fetch full PR detail (list endpoint returns summary only) - try: - full_pr = client.rest_request( - f"/repos/{owner}/{repo}/pulls/{pr_number}" - ) - if full_pr and isinstance(full_pr, dict): - pr = full_pr - except Exception as e: - logger.debug("Failed to fetch full PR #%s: %s", pr_number, e) - - logger.debug("Fetching comments for PR #%s", pr_number) - comments = fetch_comments_from_github( - client, owner, repo, pr_number, start_time, end_time - ) - time.sleep(0.2) - logger.debug("Fetching reviews for PR #%s", pr_number) - reviews = fetch_pr_reviews_from_github( - client, owner, repo, pr_number, start_time, end_time - ) - time.sleep(0.2) - # Yield nested format: { pr_info: , comments: [...], reviews: [...] } - yield {"pr_info": pr, "comments": comments, "reviews": reviews} + logger.debug( + "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num + ) + + yield from _yield_issue_pr_items_for_list_page(items) if etag_cache is not None and response_etag: - etag_cache.set("pulls", page, "", "", response_etag) + etag_cache.set("issues_and_prs", page_num, since_iso, "", response_etag) - if len(prs) < per_page or flag: - logger.debug(f"Last page reached (got {len(prs)} PRs, expected {per_page})") + if next_url is None: + logger.debug('Last page reached (no Link rel="next")') break - page += 1 + break + + # Phase 2: follow Link rel="next" URLs (full GET; no ETag cache). 
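+    # rest_request_url does not return an ETag, so there is nothing to store for these pages.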
+ while next_url: time.sleep(0.2) + try: + items, next_url = client.rest_request_url(next_url) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code == 422: + logger.debug( + "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" + ) + return + raise + page_num += 1 + + if not items: + logger.debug("No more issues/PRs found") + break + + logger.debug( + "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num + ) + + yield from _yield_issue_pr_items_for_list_page(items) + + if next_url is None: + logger.debug('Last page reached (no Link rel="next")') + break diff --git a/github_activity_tracker/preprocessors/github_preprocess.py b/github_activity_tracker/preprocessors/github_preprocess.py index 11643a6a..8911f119 100644 --- a/github_activity_tracker/preprocessors/github_preprocess.py +++ b/github_activity_tracker/preprocessors/github_preprocess.py @@ -18,7 +18,7 @@ "content": , "metadata": { "doc_id": , - "ids": ":issue:" or ":pr:", + "source_ids": ":issue:" or ":pr:", "type": "issue" | "pr", "number": , "title": , @@ -39,7 +39,7 @@ import logging from datetime import datetime, timezone from pathlib import Path -from typing import Any, Generator +from typing import Any, Generator, Literal from operations.md_ops.issue_to_md import issue_json_to_md from operations.md_ops.pr_to_md import pr_json_to_md @@ -108,6 +108,11 @@ def _iter_json_files( yield path, data +def get_ids_for_pinecone(repo: str, type: Literal["issue", "pr"], number: int) -> str: + """Get the ids for Pinecone from a repo, type, and number.""" + return f"{repo}:{type}:{number}" + + # --------------------------------------------------------------------------- # Public iterators # --------------------------------------------------------------------------- @@ -171,7 +176,7 @@ def build_issue_document( "content": content, "metadata": { "doc_id": html_url, - "ids": f"{repo}:issue:{number}", + "source_ids": get_ids_for_pinecone(repo, "issue", number), "type": "issue", "number": number, "title": (info.get("title") or "").strip(), @@ -216,7 +221,7 @@ def build_pr_document( "content": content, "metadata": { "doc_id": html_url, - "ids": f"{repo}:pr:{number}", + "source_ids": get_ids_for_pinecone(repo, "pr", number), "type": "pr", "number": number, "title": (info.get("title") or "").strip(), @@ -264,7 +269,7 @@ def preprocess_issues( for path, data in iter_raw_issue_jsons(owner, repo): info = data.get("issue_info") or {} number = info.get("number") or -1 - ids_val = f"{repo}:issue:{number}" + ids_val = get_ids_for_pinecone(repo, "issue", number) is_failed = ids_val in failed_set updated_at = _parse_updated_at(info) @@ -322,7 +327,7 @@ def preprocess_prs( for path, data in iter_raw_pr_jsons(owner, repo): info = data.get("pr_info") or {} number = info.get("number") or -1 - ids_val = f"{repo}:pr:{number}" + ids_val = get_ids_for_pinecone(repo, "pr", number) is_failed = ids_val in failed_set updated_at = _parse_updated_at(info) diff --git a/github_activity_tracker/services.py b/github_activity_tracker/services.py index 41a8d80a..7a5e6510 100644 --- a/github_activity_tracker/services.py +++ b/github_activity_tracker/services.py @@ -196,7 +196,6 @@ def create_or_update_commit( commit_at: Optional[datetime] = None, ) -> tuple[GitCommit, bool]: """Create or update a GitCommit by repo + commit_hash. 
Returns (commit, created).""" - from datetime import datetime, timezone if not commit_at: commit_at = datetime.now(timezone.utc) @@ -377,7 +376,6 @@ def create_or_update_issue( issue_obj.issue_created_at = issue_created_at issue_obj.issue_updated_at = issue_updated_at issue_obj.issue_closed_at = issue_closed_at - issue_obj.updated_at = datetime.now(timezone.utc) issue_obj.save() return issue_obj, created @@ -465,7 +463,6 @@ def create_or_update_pull_request( pr_obj.pr_updated_at = pr_updated_at pr_obj.pr_merged_at = pr_merged_at pr_obj.pr_closed_at = pr_closed_at - pr_obj.updated_at = datetime.now(timezone.utc) pr_obj.save() return pr_obj, created diff --git a/github_activity_tracker/sync/__init__.py b/github_activity_tracker/sync/__init__.py index c7e0f181..481a6071 100644 --- a/github_activity_tracker/sync/__init__.py +++ b/github_activity_tracker/sync/__init__.py @@ -1,7 +1,7 @@ """ GitHub sync package: read last updated from DB, fetch from GitHub, save via services. -Split by entity: repos, commits, issues, pull_requests. +Split by entity: repos, commits, issues_and_prs. Entry point: sync_github(repo) runs all in order for that repo. Accepts GitHubRepository or any subclass (e.g. BoostLibraryRepository); base fields are used. """ @@ -12,8 +12,7 @@ from typing import TYPE_CHECKING, Optional from .commits import sync_commits -from .issues import sync_issues -from .pull_requests import sync_pull_requests +from .issues_and_prs import sync_issues_and_prs from .repos import sync_repos if TYPE_CHECKING: @@ -25,7 +24,10 @@ def sync_github( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, ) -> dict[str, list[int]]: - """Run full sync for one repo: repos (metadata), then commits, issues, pull requests. + """Run full sync for one repo: repos (metadata), then commits, issues and pull requests. + + Issues and PRs are fetched together via a single GitHub /issues list call which + returns both; items are routed internally by the presence of a "pull_request" key. Accepts GitHubRepository or a subclass (e.g. BoostLibraryRepository); the same base row is used, so extended models can be passed and sync will work. @@ -41,6 +43,4 @@ def sync_github( """ sync_repos(repo) sync_commits(repo, start_date=start_date, end_date=end_date) - issue_numbers = sync_issues(repo, start_date=start_date, end_date=end_date) - pr_numbers = sync_pull_requests(repo, start_date=start_date, end_date=end_date) - return {"issues": issue_numbers, "pull_requests": pr_numbers} + return sync_issues_and_prs(repo, start_date=start_date, end_date=end_date) diff --git a/github_activity_tracker/sync/issues.py b/github_activity_tracker/sync/issues.py deleted file mode 100644 index 6cf3e229..00000000 --- a/github_activity_tracker/sync/issues.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Sync GitHub issues (comments, assignees, labels) with the database. - -Flow: -1. Process existing JSON files in workspace///issues/*.json (load → DB → remove file). -2. Fetch from GitHub, save each as issues/.json, persist to DB, then remove the file. 
-""" - -from __future__ import annotations - -import json -import logging -from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Optional - -from cppa_user_tracker.services import get_or_create_github_account -from github_activity_tracker import fetcher, services -from .raw_source import save_issue_raw_source -from .etag_cache import RedisListETagCache -from github_activity_tracker.workspace import ( - get_issue_json_path, - iter_existing_issue_jsons, -) -from github_ops import get_github_client -from github_ops.client import ConnectionException, RateLimitException -from github_activity_tracker.sync.utils import ( - normalize_issue_json, - parse_datetime, - parse_github_user, -) - -if TYPE_CHECKING: - from github_activity_tracker.models import GitHubRepository - -logger = logging.getLogger(__name__) - - -def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: - """Apply one issue dict (with comments, assignees, labels) to the database. - Accepts flat or nested { issue_info, comments } format.""" - issue_data = normalize_issue_json(issue_data) - user_info = parse_github_user(issue_data.get("user")) - if not user_info["account_id"]: - logger.warning( - "Issue #%s: no user account_id; skipping", - issue_data.get("number", "?"), - ) - return - account, _ = get_or_create_github_account( - github_account_id=user_info["account_id"], - username=user_info["username"], - display_name=user_info["display_name"], - avatar_url=user_info["avatar_url"], - ) - - issue_obj, _ = services.create_or_update_issue( - repo=repo, - account=account, - issue_number=issue_data.get("number"), - issue_id=issue_data.get("id"), - title=issue_data.get("title", ""), - body=issue_data.get("body", ""), - state=issue_data.get("state", "open"), - state_reason=issue_data.get("state_reason", ""), - issue_created_at=parse_datetime(issue_data.get("created_at")), - issue_updated_at=parse_datetime(issue_data.get("updated_at")), - issue_closed_at=parse_datetime(issue_data.get("closed_at")), - ) - - for comment_data in issue_data.get("comments", []): - comment_user_info = parse_github_user(comment_data.get("user")) - if comment_user_info["account_id"]: - comment_account, _ = get_or_create_github_account( - github_account_id=comment_user_info["account_id"], - username=comment_user_info["username"], - display_name=comment_user_info["display_name"], - avatar_url=comment_user_info["avatar_url"], - ) - services.create_or_update_issue_comment( - issue=issue_obj, - account=comment_account, - issue_comment_id=comment_data.get("id"), - body=comment_data.get("body", ""), - issue_comment_created_at=parse_datetime(comment_data.get("created_at")), - issue_comment_updated_at=parse_datetime(comment_data.get("updated_at")), - ) - - assignee_infos = [parse_github_user(a) for a in issue_data.get("assignees", [])] - current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} - for assignee_account in issue_obj.assignees.all(): - if assignee_account.github_account_id not in current_assignee_ids: - services.remove_issue_assignee(issue_obj, assignee_account) - for assignee_info in assignee_infos: - if assignee_info["account_id"]: - assignee_account, _ = get_or_create_github_account( - github_account_id=assignee_info["account_id"], - username=assignee_info["username"], - display_name=assignee_info["display_name"], - avatar_url=assignee_info["avatar_url"], - ) - services.add_issue_assignee(issue_obj, assignee_account) - - for label_data in issue_data.get("labels", []): - label_name = 
label_data.get("name", "") - if label_name: - services.add_issue_label(issue_obj, label_name) - - logger.debug("Issue #%s: saved to DB", issue_data.get("number")) - - -def _process_existing_issue_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: - """Load each issues/*.json in workspace for this repo, save to DB, remove file. - - Returns: - (count, issue_numbers) — count of processed files and their issue numbers. - """ - owner = repo.owner_account.username - repo_name = repo.repo_name - count = 0 - numbers: list[int] = [] - for path in iter_existing_issue_jsons(owner, repo_name): - try: - data = json.loads(path.read_text(encoding="utf-8")) - _process_issue_data(repo, data) - save_issue_raw_source(owner, repo_name, data) - path.unlink() - number = (data.get("issue_info") or {}).get("number") or data.get("number") - if number is not None: - numbers.append(number) - count += 1 - except Exception as e: - logger.exception("Failed to process %s: %s", path, e) - return count, numbers - - -def sync_issues( - repo: GitHubRepository, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, -) -> list[int]: - """1) Process existing workspace JSONs; 2) Fetch from GitHub, save as JSON, persist to DB, remove file. - - Args: - repo: Repository to sync. - start_date: Override start date (default: last issue updated_at + 1s, or None if no issues). - end_date: Override end date (default: None = no end; stable ETag cache). - - Returns: - List of issue numbers processed during this sync run. - """ - logger.info("sync_issues: starting for repo id=%s (%s)", repo.pk, repo.repo_name) - - owner = repo.owner_account.username - repo_name = repo.repo_name - processed_numbers: list[int] = [] - - try: - # Phase 1: process existing JSON files - n_existing, existing_numbers = _process_existing_issue_jsons(repo) - processed_numbers.extend(existing_numbers) - if n_existing: - logger.info("sync_issues: processed %s existing issue JSON(s)", n_existing) - - # Phase 2: fetch from GitHub, write JSON, persist to DB, remove file - client = get_github_client() - if start_date is None: - last_issue = repo.issues.order_by("-issue_updated_at").first() - if last_issue: - start_date = last_issue.issue_updated_at + timedelta(seconds=1) - # Leave end_date as None when not set so ETag cache semantics stay stable. 
- - count = 0 - etag_cache = RedisListETagCache(repo_id=repo.pk) - for issue_data in fetcher.fetch_issues_from_github( - client, owner, repo_name, start_date, end_date, etag_cache=etag_cache - ): - issue_number = (issue_data.get("issue_info") or {}).get( - "number" - ) or issue_data.get("number") - if issue_number is None: - continue - json_path = get_issue_json_path(owner, repo_name, issue_number) - json_path.parent.mkdir(parents=True, exist_ok=True) - json_path.write_text( - json.dumps(issue_data, indent=2, default=str), encoding="utf-8" - ) - _process_issue_data(repo, issue_data) - save_issue_raw_source(owner, repo_name, issue_data) - json_path.unlink() - processed_numbers.append(issue_number) - count += 1 - - logger.info( - "sync_issues: finished for repo id=%s; %s existing + %s fetched", - repo.pk, - n_existing, - count, - ) - - except (RateLimitException, ConnectionException) as e: - logger.error("sync_issues: failed for repo id=%s: %s", repo.pk, e) - raise - except Exception as e: - logger.exception("sync_issues: unexpected error for repo id=%s: %s", repo.pk, e) - raise - - return processed_numbers diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py new file mode 100644 index 00000000..d893ba02 --- /dev/null +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -0,0 +1,414 @@ +""" +Sync GitHub issues and pull requests together using a single /issues list API call. + +Flow: +1. Process existing JSON files in workspace///issues/*.json and prs/*.json. +2. Fetch from GitHub via fetch_issues_and_prs_from_github (one endpoint, routes by key). + For each item: save as JSON, persist to DB, remove file. +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timedelta +from typing import TYPE_CHECKING, Optional + +from cppa_user_tracker.services import get_or_create_github_account +from github_activity_tracker import fetcher, services +from github_activity_tracker.sync.etag_cache import RedisListETagCache +from github_activity_tracker.sync.raw_source import ( + save_issue_raw_source, + save_pr_raw_source, +) +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, + parse_datetime, + parse_github_user, +) +from github_activity_tracker.workspace import ( + get_issue_json_path, + get_pr_json_path, + iter_existing_issue_jsons, + iter_existing_pr_jsons, +) +from github_ops import get_github_client +from github_ops.client import ConnectionException, RateLimitException + +if TYPE_CHECKING: + from github_activity_tracker.models import GitHubRepository + +logger = logging.getLogger(__name__) + + +def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: + """Apply one issue dict (with comments, assignees, labels) to the database. 
+ Accepts flat or nested { issue_info, comments } format.""" + issue_data = normalize_issue_json(issue_data) + user_info = parse_github_user(issue_data.get("user")) + if not user_info["account_id"]: + logger.warning( + "Issue #%s: no user account_id; skipping", + issue_data.get("number", "?"), + ) + return + account, _ = get_or_create_github_account( + github_account_id=user_info["account_id"], + username=user_info["username"], + display_name=user_info["display_name"], + avatar_url=user_info["avatar_url"], + ) + + issue_obj, _ = services.create_or_update_issue( + repo=repo, + account=account, + issue_number=issue_data.get("number"), + issue_id=issue_data.get("id"), + title=issue_data.get("title", ""), + body=issue_data.get("body", ""), + state=issue_data.get("state", "open"), + state_reason=issue_data.get("state_reason", ""), + issue_created_at=parse_datetime(issue_data.get("created_at")), + issue_updated_at=parse_datetime(issue_data.get("updated_at")), + issue_closed_at=parse_datetime(issue_data.get("closed_at")), + ) + + for comment_data in issue_data.get("comments", []): + comment_user_info = parse_github_user(comment_data.get("user")) + if comment_user_info["account_id"]: + comment_account, _ = get_or_create_github_account( + github_account_id=comment_user_info["account_id"], + username=comment_user_info["username"], + display_name=comment_user_info["display_name"], + avatar_url=comment_user_info["avatar_url"], + ) + services.create_or_update_issue_comment( + issue=issue_obj, + account=comment_account, + issue_comment_id=comment_data.get("id"), + body=comment_data.get("body", ""), + issue_comment_created_at=parse_datetime(comment_data.get("created_at")), + issue_comment_updated_at=parse_datetime(comment_data.get("updated_at")), + ) + + assignee_infos = [parse_github_user(a) for a in issue_data.get("assignees", [])] + current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} + for assignee_account in issue_obj.assignees.all(): + if assignee_account.github_account_id not in current_assignee_ids: + services.remove_issue_assignee(issue_obj, assignee_account) + for assignee_info in assignee_infos: + if assignee_info["account_id"]: + assignee_account, _ = get_or_create_github_account( + github_account_id=assignee_info["account_id"], + username=assignee_info["username"], + display_name=assignee_info["display_name"], + avatar_url=assignee_info["avatar_url"], + ) + services.add_issue_assignee(issue_obj, assignee_account) + + incoming_label_names = { + (label_data.get("name") or "") + for label_data in issue_data.get("labels", []) + if (label_data.get("name") or "") + } + existing_label_names = { + il.label_name for il in issue_obj.labels.all() if il.label_name + } + for label_name in existing_label_names - incoming_label_names: + services.remove_issue_label(issue_obj, label_name) + for label_name in incoming_label_names - existing_label_names: + services.add_issue_label(issue_obj, label_name) + + logger.debug("Issue #%s: saved to DB", issue_data.get("number")) + + +def _process_existing_issue_jsons( + repo: GitHubRepository, +) -> tuple[int, list[int]]: + """Load each issues/*.json in workspace for this repo, save to DB, remove file. + + Returns: + (count, issue_numbers) — count of processed files and their issue numbers. 
+ """ + owner = repo.owner_account.username + repo_name = repo.repo_name + count = 0 + numbers: list[int] = [] + for path in iter_existing_issue_jsons(owner, repo_name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + _process_issue_data(repo, data) + save_issue_raw_source(owner, repo_name, data) + path.unlink() + number = (data.get("issue_info") or {}).get("number") or data.get("number") + if number is not None: + numbers.append(number) + count += 1 + except Exception as e: + logger.exception("Failed to process %s: %s", path, e) + return count, numbers + + +def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: + """Apply one PR dict (with comments, reviews, assignees, labels) to the database. + Accepts flat or nested { pr_info, comments, reviews } format.""" + pr_data = normalize_pr_json(pr_data) + user_info = parse_github_user(pr_data.get("user")) + if not user_info["account_id"]: + logger.warning( + "PR #%s: no user account_id; skipping", + pr_data.get("number", "?"), + ) + return + account, _ = get_or_create_github_account( + github_account_id=user_info["account_id"], + username=user_info["username"], + display_name=user_info["display_name"], + avatar_url=user_info["avatar_url"], + ) + + pr_obj, _ = services.create_or_update_pull_request( + repo=repo, + account=account, + pr_number=pr_data.get("number"), + pr_id=pr_data.get("id"), + title=pr_data.get("title", ""), + body=pr_data.get("body", ""), + state=pr_data.get("state", "open"), + head_hash=pr_data.get("head", {}).get("sha", ""), + base_hash=pr_data.get("base", {}).get("sha", ""), + pr_created_at=parse_datetime(pr_data.get("created_at")), + pr_updated_at=parse_datetime(pr_data.get("updated_at")), + pr_merged_at=parse_datetime(pr_data.get("merged_at")), + pr_closed_at=parse_datetime(pr_data.get("closed_at")), + ) + + for comment_data in pr_data.get("comments", []): + comment_user_info = parse_github_user(comment_data.get("user")) + if comment_user_info["account_id"]: + comment_account, _ = get_or_create_github_account( + github_account_id=comment_user_info["account_id"], + username=comment_user_info["username"], + display_name=comment_user_info["display_name"], + avatar_url=comment_user_info["avatar_url"], + ) + services.create_or_update_pr_comment( + pr=pr_obj, + account=comment_account, + pr_comment_id=comment_data.get("id"), + body=comment_data.get("body", ""), + pr_comment_created_at=parse_datetime(comment_data.get("created_at")), + pr_comment_updated_at=parse_datetime(comment_data.get("updated_at")), + ) + + for review_data in pr_data.get("reviews", []): + review_user_info = parse_github_user(review_data.get("user")) + if review_user_info["account_id"]: + review_account, _ = get_or_create_github_account( + github_account_id=review_user_info["account_id"], + username=review_user_info["username"], + display_name=review_user_info["display_name"], + avatar_url=review_user_info["avatar_url"], + ) + services.create_or_update_pr_review( + pr=pr_obj, + account=review_account, + pr_review_id=review_data.get("id"), + body=review_data.get("body", ""), + in_reply_to_id=review_data.get("in_reply_to_id"), + pr_review_created_at=parse_datetime(review_data.get("created_at")), + pr_review_updated_at=parse_datetime(review_data.get("updated_at")), + ) + + assignee_infos = [parse_github_user(a) for a in pr_data.get("assignees", [])] + current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} + for assignee_account in pr_obj.assignees.all(): + if assignee_account.github_account_id not in 
current_assignee_ids: + services.remove_pr_assignee(pr_obj, assignee_account) + for assignee_info in assignee_infos: + if assignee_info["account_id"]: + assignee_account, _ = get_or_create_github_account( + github_account_id=assignee_info["account_id"], + username=assignee_info["username"], + display_name=assignee_info["display_name"], + avatar_url=assignee_info["avatar_url"], + ) + services.add_pr_assignee(pr_obj, assignee_account) + + incoming_pr_label_names = { + (label_data.get("name") or "") + for label_data in pr_data.get("labels", []) + if (label_data.get("name") or "") + } + existing_pr_label_names = { + pl.label_name for pl in pr_obj.labels.all() if pl.label_name + } + for label_name in existing_pr_label_names - incoming_pr_label_names: + services.remove_pull_request_label(pr_obj, label_name) + for label_name in incoming_pr_label_names - existing_pr_label_names: + services.add_pull_request_label(pr_obj, label_name) + + logger.debug("PR #%s: saved to DB", pr_data.get("number")) + + +def _process_existing_pr_jsons( + repo: GitHubRepository, +) -> tuple[int, list[int]]: + """Load each prs/*.json in workspace for this repo, save to DB, remove file. + + Returns: + (count, pr_numbers) — count of processed files and their PR numbers. + """ + owner = repo.owner_account.username + repo_name = repo.repo_name + count = 0 + numbers: list[int] = [] + for path in iter_existing_pr_jsons(owner, repo_name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + _process_pr_data(repo, data) + save_pr_raw_source(owner, repo_name, data) + path.unlink() + number = (data.get("pr_info") or {}).get("number") or data.get("number") + if number is not None: + numbers.append(number) + count += 1 + except Exception as e: + logger.exception("Failed to process %s: %s", path, e) + return count, numbers + + +def sync_issues_and_prs( + repo: GitHubRepository, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, +) -> dict[str, list[int]]: + """Sync issues and PRs for a repo using a single GitHub /issues list call. + + 1. Process any existing issue/PR JSON files left from a previous interrupted run. + 2. Determine the start date as the later (max) of the last-seen issue and PR update times. + 3. Fetch items via fetch_issues_and_prs_from_github; each item is routed by key: + - "issue_info" → persisted as an issue + - "pr_info" → persisted as a pull request + + Args: + repo: Repository to sync. + start_date: Override start date (default: derived from DB; see below). + end_date: Override end date (default: None = no upper bound). + + Returns: + {"issues": [], "pull_requests": []} + """ + logger.info( + "sync_issues_and_prs: starting for repo id=%s (%s)", + repo.pk, + repo.repo_name, + ) + + owner = repo.owner_account.username + repo_name = repo.repo_name + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] + + try: + # Phase 1: process any JSON files left from a previous interrupted run. + n_issues, existing_issue_nums = _process_existing_issue_jsons(repo) + issue_numbers.extend(existing_issue_nums) + n_prs, existing_pr_nums = _process_existing_pr_jsons(repo) + pr_numbers.extend(existing_pr_nums) + if n_issues or n_prs: + logger.info( + "sync_issues_and_prs: processed %s existing issue JSON(s), %s existing PR JSON(s)", + n_issues, + n_prs, + ) + + # Phase 2: determine start date as max(last issue, last PR) +1s — shared /issues timeline. 
+ if start_date is None: + last_issue = repo.issues.order_by("-issue_updated_at").first() + last_pr = repo.pull_requests.order_by("-pr_updated_at").first() + + issue_date = ( + (last_issue.issue_updated_at + timedelta(seconds=1)) + if last_issue and last_issue.issue_updated_at is not None + else None + ) + pr_date = ( + (last_pr.pr_updated_at + timedelta(seconds=1)) + if last_pr and last_pr.pr_updated_at is not None + else None + ) + + if issue_date and pr_date: + start_date = max(issue_date, pr_date) + else: + start_date = issue_date or pr_date + + # Phase 3: fetch from GitHub, write JSON, persist to DB, remove file. + client = get_github_client() + etag_cache = RedisListETagCache(repo_id=repo.pk) + count_issues = 0 + count_prs = 0 + + for item in fetcher.fetch_issues_and_prs_from_github( + client, + owner, + repo_name, + start_date, + end_date, + etag_cache=etag_cache, + ): + if "pr_info" in item: + pr_number = (item["pr_info"] or {}).get("number") + if pr_number is None: + continue + json_path = get_pr_json_path(owner, repo_name, pr_number) + json_path.parent.mkdir(parents=True, exist_ok=True) + json_path.write_text( + json.dumps(item, indent=2, default=str), encoding="utf-8" + ) + _process_pr_data(repo, item) + save_pr_raw_source(owner, repo_name, item) + json_path.unlink() + pr_numbers.append(pr_number) + count_prs += 1 + else: + issue_number = (item.get("issue_info") or {}).get("number") or item.get( + "number" + ) + if issue_number is None: + continue + json_path = get_issue_json_path(owner, repo_name, issue_number) + json_path.parent.mkdir(parents=True, exist_ok=True) + json_path.write_text( + json.dumps(item, indent=2, default=str), encoding="utf-8" + ) + _process_issue_data(repo, item) + save_issue_raw_source(owner, repo_name, item) + json_path.unlink() + issue_numbers.append(issue_number) + count_issues += 1 + + logger.info( + "sync_issues_and_prs: finished for repo id=%s; " + "%s existing issues + %s fetched; %s existing PRs + %s fetched", + repo.pk, + n_issues, + count_issues, + n_prs, + count_prs, + ) + + except (RateLimitException, ConnectionException) as e: + logger.error("sync_issues_and_prs: failed for repo id=%s: %s", repo.pk, e) + raise + except Exception as e: + logger.exception( + "sync_issues_and_prs: unexpected error for repo id=%s: %s", + repo.pk, + e, + ) + raise + + return {"issues": issue_numbers, "pull_requests": pr_numbers} diff --git a/github_activity_tracker/sync/pull_requests.py b/github_activity_tracker/sync/pull_requests.py deleted file mode 100644 index b9a81d5a..00000000 --- a/github_activity_tracker/sync/pull_requests.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -Sync GitHub pull requests (reviews, comments, assignees, labels) with the database. - -Flow: -1. Process existing JSON files in workspace///prs/*.json (load → DB → remove file). -2. Fetch from GitHub, save each as prs/.json, persist to DB, then remove the file. 
-""" - -from __future__ import annotations - -import json -import logging -from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Optional - -from cppa_user_tracker.services import get_or_create_github_account -from github_activity_tracker import fetcher, services -from .raw_source import save_pr_raw_source -from .etag_cache import RedisListETagCache -from github_activity_tracker.workspace import ( - get_pr_json_path, - iter_existing_pr_jsons, -) -from github_ops import get_github_client -from github_ops.client import ConnectionException, RateLimitException -from github_activity_tracker.sync.utils import ( - normalize_pr_json, - parse_datetime, - parse_github_user, -) - -if TYPE_CHECKING: - from github_activity_tracker.models import GitHubRepository - -logger = logging.getLogger(__name__) - - -def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: - """Apply one PR dict (with comments, reviews, assignees, labels) to the database. - Accepts flat or nested { pr_info, comments, reviews } format.""" - pr_data = normalize_pr_json(pr_data) - user_info = parse_github_user(pr_data.get("user")) - if not user_info["account_id"]: - logger.warning( - "PR #%s: no user account_id; skipping", - pr_data.get("number", "?"), - ) - return - account, _ = get_or_create_github_account( - github_account_id=user_info["account_id"], - username=user_info["username"], - display_name=user_info["display_name"], - avatar_url=user_info["avatar_url"], - ) - - pr_obj, _ = services.create_or_update_pull_request( - repo=repo, - account=account, - pr_number=pr_data.get("number"), - pr_id=pr_data.get("id"), - title=pr_data.get("title", ""), - body=pr_data.get("body", ""), - state=pr_data.get("state", "open"), - head_hash=pr_data.get("head", {}).get("sha", ""), - base_hash=pr_data.get("base", {}).get("sha", ""), - pr_created_at=parse_datetime(pr_data.get("created_at")), - pr_updated_at=parse_datetime(pr_data.get("updated_at")), - pr_merged_at=parse_datetime(pr_data.get("merged_at")), - pr_closed_at=parse_datetime(pr_data.get("closed_at")), - ) - - for comment_data in pr_data.get("comments", []): - comment_user_info = parse_github_user(comment_data.get("user")) - if comment_user_info["account_id"]: - comment_account, _ = get_or_create_github_account( - github_account_id=comment_user_info["account_id"], - username=comment_user_info["username"], - display_name=comment_user_info["display_name"], - avatar_url=comment_user_info["avatar_url"], - ) - services.create_or_update_pr_comment( - pr=pr_obj, - account=comment_account, - pr_comment_id=comment_data.get("id"), - body=comment_data.get("body", ""), - pr_comment_created_at=parse_datetime(comment_data.get("created_at")), - pr_comment_updated_at=parse_datetime(comment_data.get("updated_at")), - ) - - for review_data in pr_data.get("reviews", []): - review_user_info = parse_github_user(review_data.get("user")) - if review_user_info["account_id"]: - review_account, _ = get_or_create_github_account( - github_account_id=review_user_info["account_id"], - username=review_user_info["username"], - display_name=review_user_info["display_name"], - avatar_url=review_user_info["avatar_url"], - ) - services.create_or_update_pr_review( - pr=pr_obj, - account=review_account, - pr_review_id=review_data.get("id"), - body=review_data.get("body", ""), - in_reply_to_id=review_data.get("in_reply_to_id"), - pr_review_created_at=parse_datetime(review_data.get("created_at")), - pr_review_updated_at=parse_datetime(review_data.get("updated_at")), - ) - - for assignee_data in 
pr_data.get("assignees", []): - assignee_info = parse_github_user(assignee_data) - if assignee_info["account_id"]: - assignee_account, _ = get_or_create_github_account( - github_account_id=assignee_info["account_id"], - username=assignee_info["username"], - display_name=assignee_info["display_name"], - avatar_url=assignee_info["avatar_url"], - ) - services.add_pr_assignee(pr_obj, assignee_account) - - for label_data in pr_data.get("labels", []): - label_name = label_data.get("name", "") - if label_name: - services.add_pull_request_label(pr_obj, label_name) - - logger.debug("PR #%s: saved to DB", pr_data.get("number")) - - -def _process_existing_pr_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: - """Load each prs/*.json in workspace for this repo, save to DB, remove file. - - Returns: - (count, pr_numbers) — count of processed files and their PR numbers. - """ - owner = repo.owner_account.username - repo_name = repo.repo_name - count = 0 - numbers: list[int] = [] - for path in iter_existing_pr_jsons(owner, repo_name): - try: - data = json.loads(path.read_text(encoding="utf-8")) - _process_pr_data(repo, data) - save_pr_raw_source(owner, repo_name, data) - path.unlink() - number = (data.get("pr_info") or {}).get("number") or data.get("number") - if number is not None: - numbers.append(number) - count += 1 - except Exception as e: - logger.exception("Failed to process %s: %s", path, e) - return count, numbers - - -def sync_pull_requests( - repo: GitHubRepository, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, -) -> list[int]: - """1) Process existing workspace JSONs; 2) Fetch from GitHub, save as JSON, persist to DB, remove file. - - Args: - repo: Repository to sync. - start_date: Override start date (default: last PR updated_at + 1s, or None if no PRs). - end_date: Override end date (default: None = no end; stable ETag cache). - - Returns: - List of PR numbers processed during this sync run. - """ - logger.info( - "sync_pull_requests: starting for repo id=%s (%s)", - repo.pk, - repo.repo_name, - ) - - owner = repo.owner_account.username - repo_name = repo.repo_name - processed_numbers: list[int] = [] - - try: - # Phase 1: process existing JSON files - n_existing, existing_numbers = _process_existing_pr_jsons(repo) - processed_numbers.extend(existing_numbers) - if n_existing: - logger.info( - "sync_pull_requests: processed %s existing PR JSON(s)", - n_existing, - ) - - # Phase 2: fetch from GitHub, write JSON, persist to DB, remove file - client = get_github_client() - if start_date is None: - last_pr = repo.pull_requests.order_by("-pr_updated_at").first() - if last_pr: - start_date = last_pr.pr_updated_at + timedelta(seconds=1) - # Leave end_date as None when not set so ETag cache semantics stay stable. 
-
-        count = 0
-        etag_cache = RedisListETagCache(repo_id=repo.pk)
-        for pr_data in fetcher.fetch_pull_requests_from_github(
-            client, owner, repo_name, start_date, end_date, etag_cache=etag_cache
-        ):
-            pr_number = (pr_data.get("pr_info") or {}).get("number") or pr_data.get(
-                "number"
-            )
-            if pr_number is None:
-                continue
-            json_path = get_pr_json_path(owner, repo_name, pr_number)
-            json_path.parent.mkdir(parents=True, exist_ok=True)
-            json_path.write_text(
-                json.dumps(pr_data, indent=2, default=str), encoding="utf-8"
-            )
-            _process_pr_data(repo, pr_data)
-            save_pr_raw_source(owner, repo_name, pr_data)
-            json_path.unlink()
-            processed_numbers.append(pr_number)
-            count += 1
-
-        logger.info(
-            "sync_pull_requests: finished for repo id=%s; %s existing + %s fetched",
-            repo.pk,
-            n_existing,
-            count,
-        )
-
-    except (RateLimitException, ConnectionException) as e:
-        logger.error("sync_pull_requests: failed for repo id=%s: %s", repo.pk, e)
-        raise
-    except Exception as e:
-        logger.exception(
-            "sync_pull_requests: unexpected error for repo id=%s: %s",
-            repo.pk,
-            e,
-        )
-        raise
-
-    return processed_numbers
diff --git a/github_activity_tracker/tests/test_client_link_parsing.py b/github_activity_tracker/tests/test_client_link_parsing.py
new file mode 100644
index 00000000..fd10c368
--- /dev/null
+++ b/github_activity_tracker/tests/test_client_link_parsing.py
@@ -0,0 +1,57 @@
+"""Tests for GitHubAPIClient Link header parsing methods."""
+
+from github_ops.client import GitHubAPIClient
+
+
+def test_parse_link_rels_parses_all_rels():
+    """_parse_link_rels returns dict with all rel→url pairs from Link header."""
+    link_header = (
+        '<https://api.github.com/repos/o/r/commits?page=2>; rel="next", '
+        '<https://api.github.com/repos/o/r/commits?page=50>; rel="last", '
+        '<https://api.github.com/repos/o/r/commits?page=1>; rel="first"'
+    )
+    result = GitHubAPIClient._parse_link_rels(link_header)
+    assert result == {
+        "next": "https://api.github.com/repos/o/r/commits?page=2",
+        "last": "https://api.github.com/repos/o/r/commits?page=50",
+        "first": "https://api.github.com/repos/o/r/commits?page=1",
+    }
+
+
+def test_parse_link_rels_handles_prev_rel():
+    """_parse_link_rels includes prev rel when present."""
+    link_header = (
+        '<https://api.github.com/repos/o/r/commits?page=49>; rel="prev", '
+        '<https://api.github.com/repos/o/r/commits?page=1>; rel="first"'
+    )
+    result = GitHubAPIClient._parse_link_rels(link_header)
+    assert result == {
+        "prev": "https://api.github.com/repos/o/r/commits?page=49",
+        "first": "https://api.github.com/repos/o/r/commits?page=1",
+    }
+
+
+def test_parse_link_rels_returns_empty_dict_when_no_header():
+    """_parse_link_rels returns empty dict when Link header is None or empty."""
+    assert GitHubAPIClient._parse_link_rels(None) == {}
+    assert GitHubAPIClient._parse_link_rels("") == {}
+
+
+def test_parse_link_rels_handles_single_rel():
+    """_parse_link_rels works with a single rel in the header."""
+    link_header = '<https://api.github.com/repos/o/r/commits?page=2>; rel="next"'
+    result = GitHubAPIClient._parse_link_rels(link_header)
+    assert result == {"next": "https://api.github.com/repos/o/r/commits?page=2"}
+
+
+def test_parse_link_rels_handles_github_repository_id_format():
+    """_parse_link_rels handles GitHub's /repositories/{id}/commits format."""
+    link_header = (
+        '<https://api.github.com/repositories/7590028/commits?per_page=100&page=1>; rel="first", '
+        '<https://api.github.com/repositories/7590028/commits?per_page=100&page=522>; rel="prev"'
+    )
+    result = GitHubAPIClient._parse_link_rels(link_header)
+    assert result == {
+        "first": "https://api.github.com/repositories/7590028/commits?per_page=100&page=1",
+        "prev": "https://api.github.com/repositories/7590028/commits?per_page=100&page=522",
+    }
diff --git a/github_activity_tracker/tests/test_fetcher.py b/github_activity_tracker/tests/test_fetcher.py
index 569fb314..22571a74 100644
--- a/github_activity_tracker/tests/test_fetcher.py
+++ 
b/github_activity_tracker/tests/test_fetcher.py @@ -2,14 +2,12 @@ import pytest from datetime import datetime, timezone -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock from github_activity_tracker.fetcher import ( fetch_comments_from_github, fetch_commits_from_github, - fetch_issues_from_github, fetch_pr_reviews_from_github, - fetch_pull_requests_from_github, fetch_user_from_github, ) @@ -71,19 +69,16 @@ def test_fetch_user_from_github_returns_none_when_empty_response(): def test_fetch_commits_from_github_yields_commit_dicts(): """fetch_commits_from_github yields full commit dict from /repos/.../commits/{sha}.""" client = MagicMock() - client.rest_request.side_effect = [ - [ - { - "sha": "abc", - "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}, - } - ], - { - "sha": "abc", - "commit": {"message": "msg"}, - "stats": {"additions": 1}, - }, - ] + # New API: rest_request_with_all_links returns (data, links_dict) + client.rest_request_with_all_links.return_value = ( + [{"sha": "abc", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {}, # No links = single page + ) + client.rest_request.return_value = { + "sha": "abc", + "commit": {"message": "msg"}, + "stats": {"additions": 1}, + } items = list(fetch_commits_from_github(client, "o", "r")) assert len(items) == 1 assert items[0]["sha"] == "abc" @@ -93,84 +88,71 @@ def test_fetch_commits_from_github_yields_commit_dicts(): def test_fetch_commits_from_github_stops_on_empty_page(): """fetch_commits_from_github stops when API returns empty list.""" client = MagicMock() - client.rest_request.return_value = [] + client.rest_request_with_all_links.return_value = ([], {}) items = list(fetch_commits_from_github(client, "owner", "repo")) assert items == [] - client.rest_request.assert_called_once() def test_fetch_commits_from_github_includes_since_until_params(): """fetch_commits_from_github passes since/until when start_time/end_time given.""" client = MagicMock() - client.rest_request.return_value = [] + client.rest_request_with_all_links.return_value = ([], {}) start = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 12, 31, tzinfo=timezone.utc) list(fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end)) - call_args = client.rest_request.call_args - params = call_args[0][1] or {} + call_args = client.rest_request_with_all_links.call_args + # params is the second positional argument + params = call_args[0][1] if len(call_args[0]) > 1 else call_args[1]["params"] assert "since" in params assert "until" in params def test_fetch_commits_from_github_with_etag_cache_304_yields_nothing(): - """When etag_cache is passed and rest_request_conditional returns 304, page is skipped - and next page is requested; when next page returns empty, no items yielded and set not called. - """ + """When etag_cache is passed and rest_request_conditional_with_all_links returns 304, nothing is yielded.""" client = MagicMock() - # Page 1: 304 -> skip; page 2: empty -> break. No items, no etag_cache.set. 
- client.rest_request_conditional.side_effect = [ - (None, 'W/"cached"'), # page 1: 304 - ([], None), # page 2: empty, stops loop - ] + # Page 1: 304 -> return immediately (new behavior) + client.rest_request_conditional_with_all_links.return_value = ( + None, + 'W/"cached"', + {}, + ) etag_cache = MagicMock() etag_cache.get.return_value = 'W/"cached"' - with patch("github_activity_tracker.fetcher.time.sleep"): - items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + assert items == [] - assert client.rest_request_conditional.call_count == 2 - # Ensure we requested page 1 then page 2 (no re-requesting the same page). - call1_params = client.rest_request_conditional.call_args_list[0][1]["params"] - call2_params = client.rest_request_conditional.call_args_list[1][1]["params"] - assert call1_params["page"] == 1 - assert call2_params["page"] == 2 + client.rest_request_conditional_with_all_links.assert_called_once() etag_cache.set.assert_not_called() def test_fetch_commits_from_github_with_etag_cache_200_yields_and_sets(): - """When etag_cache is passed and rest_request_conditional returns 200, yields items and calls set - only after the page's items have been consumed (checkpoint deferred). - """ + """When etag_cache is passed and rest_request_conditional_with_all_links returns 200, yields items and calls set.""" client = MagicMock() - # Two items on page 1 so we can assert set() is not called until after both are consumed. - client.rest_request_conditional.side_effect = [ - ( - [ - {"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}, - {"sha": "def", "commit": {"author": {"date": "2024-06-02T00:00:00Z"}}}, - ], - "W/new_etag", - ), - ] + # Single page with two commits (newest first from API, yielded oldest first) + client.rest_request_conditional_with_all_links.return_value = ( + [ + {"sha": "def", "commit": {"author": {"date": "2024-06-02T00:00:00Z"}}}, + {"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}, + ], + "W/new_etag", + {}, # No links = single page + ) client.rest_request.side_effect = [ {"sha": "abc", "commit": {"message": "msg"}, "stats": {"additions": 1}}, {"sha": "def", "commit": {"message": "msg2"}, "stats": {"additions": 2}}, ] etag_cache = MagicMock() etag_cache.get.return_value = None - with patch("github_activity_tracker.fetcher.time.sleep"): - gen = fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache) - # Consume first item only; checkpoint must not be written yet. - first = next(gen) - etag_cache.set.assert_not_called() - # Consume second item; set still not called until we advance past the last yield. - second = next(gen) - etag_cache.set.assert_not_called() - # Advancing again runs the code after the for-loop (etag_cache.set) then exits. 
- with pytest.raises(StopIteration): - next(gen) - etag_cache.set.assert_called_once() - assert first["sha"] == "abc" - assert second["sha"] == "def" + + items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + # Should yield oldest first: abc, def + assert len(items) == 2 + assert items[0]["sha"] == "abc" + assert items[1]["sha"] == "def" + # ETag should be cached after processing + etag_cache.set.assert_called_once() call_args = etag_cache.set.call_args[0] assert call_args[0] == "commits" assert call_args[1] == 1 @@ -182,21 +164,18 @@ def test_fetch_commits_from_github_aborts_on_502_503_504(): import requests as req client = MagicMock() - # API returns commits (e.g. newest first); fetcher iterates reversed(), so first - # full-commit fetch is for the last in this list (def456). That fetch returns 502 → abort. - client.rest_request.side_effect = [ + # Single page with commits (newest first from API) + client.rest_request_with_all_links.return_value = ( [ - { - "sha": "abc123", - "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}, - }, - { - "sha": "def456", - "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}, - }, + {"sha": "def456", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "abc123", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, ], - req.exceptions.HTTPError("Bad Gateway", response=MagicMock(status_code=502)), - ] + {}, + ) + # First detail fetch (for abc123, oldest) returns 502 + client.rest_request.side_effect = req.exceptions.HTTPError( + "Bad Gateway", response=MagicMock(status_code=502) + ) with pytest.raises(req.exceptions.HTTPError): list(fetch_commits_from_github(client, "o", "r")) @@ -206,20 +185,19 @@ def test_fetch_commits_from_github_5xx_with_etag_cache_does_not_checkpoint(): import requests as req client = MagicMock() - client.rest_request_conditional.side_effect = [ - ( - [{"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}], - "W/new_etag", - ), - ] + client.rest_request_conditional_with_all_links.return_value = ( + [{"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}], + "W/new_etag", + {}, + ) client.rest_request.side_effect = req.exceptions.HTTPError( "Bad Gateway", response=MagicMock(status_code=502) ) etag_cache = MagicMock() etag_cache.get.return_value = None - with patch("github_activity_tracker.fetcher.time.sleep"): - with pytest.raises(req.exceptions.HTTPError): - list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + with pytest.raises(req.exceptions.HTTPError): + list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) etag_cache.set.assert_not_called() @@ -228,10 +206,13 @@ def test_fetch_commits_from_github_reraises_non_server_error_http(): import requests as req client = MagicMock() - client.rest_request.side_effect = [ + client.rest_request_with_all_links.return_value = ( [{"sha": "abc", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], - req.exceptions.HTTPError("Forbidden", response=MagicMock(status_code=403)), - ] + {}, + ) + client.rest_request.side_effect = req.exceptions.HTTPError( + "Forbidden", response=MagicMock(status_code=403) + ) with pytest.raises(req.exceptions.HTTPError): list(fetch_commits_from_github(client, "o", "r")) @@ -269,56 +250,6 @@ def test_fetch_comments_from_github_calls_correct_endpoint(): assert "/repos/owner/repo/issues/42/comments" in client.rest_request.call_args[0][0] -# --- fetch_issues_from_github --- - - -def test_fetch_issues_from_github_yields_issue_dicts(): - 
"""fetch_issues_from_github yields nested { issue_info, comments } dicts.""" - client = MagicMock() - # First page via Link-header API (list + next_url); then full issue GET; then comments - client.rest_request_with_link.return_value = ( - [{"number": 1, "title": "Issue 1", "updated_at": "2024-06-01T00:00:00Z"}], - None, - ) - client.rest_request.side_effect = [ - {"number": 1, "title": "Issue 1", "updated_at": "2024-06-01T00:00:00Z"}, - [], # comments for issue 1 - ] - items = list(fetch_issues_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["issue_info"]["number"] == 1 - assert "comments" in items[0] - assert items[0]["comments"] == [] - - -def test_fetch_issues_from_github_filters_out_pulls(): - """fetch_issues_from_github filters out items that have pull_request key.""" - client = MagicMock() - client.rest_request_with_link.return_value = ( - [ - {"number": 1, "pull_request": {}}, - {"number": 2, "updated_at": "2024-06-01T00:00:00Z"}, - ], - None, - ) - client.rest_request.side_effect = [ - {"number": 2, "updated_at": "2024-06-01T00:00:00Z"}, # full issue for #2 - [], # comments for issue 2 - ] - items = list(fetch_issues_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["issue_info"]["number"] == 2 - - -def test_fetch_issues_from_github_stops_on_empty_page(): - """fetch_issues_from_github stops when API returns empty list.""" - client = MagicMock() - client.rest_request_with_link.return_value = ([], None) - items = list(fetch_issues_from_github(client, "owner", "repo")) - assert items == [] - client.rest_request.assert_not_called() - - # --- fetch_pr_reviews_from_github --- @@ -348,53 +279,3 @@ def test_fetch_pr_reviews_from_github_calls_pulls_comments(): fetch_pr_reviews_from_github(client, "owner", "repo", pr_number=3) client.rest_request.assert_called_once() assert "/repos/owner/repo/pulls/3/comments" in client.rest_request.call_args[0][0] - - -# --- fetch_pull_requests_from_github --- - - -def test_fetch_pull_requests_from_github_yields_pr_dicts(): - """fetch_pull_requests_from_github yields nested { pr_info, comments, reviews } dicts.""" - client = MagicMock() - client.rest_request.side_effect = [ - [ - { - "number": 1, - "updated_at": "2024-06-01T00:00:00Z", - "created_at": "2024-05-01T00:00:00Z", - }, - ], - { - "number": 1, - "updated_at": "2024-06-01T00:00:00Z", - "created_at": "2024-05-01T00:00:00Z", - }, # full PR - [], # comments for PR 1 - [], # reviews for PR 1 - ] - items = list(fetch_pull_requests_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["pr_info"]["number"] == 1 - assert "comments" in items[0] - assert "reviews" in items[0] - assert items[0]["comments"] == [] - assert items[0]["reviews"] == [] - - -def test_fetch_pull_requests_from_github_stops_on_empty_page(): - """fetch_pull_requests_from_github stops when API returns empty list.""" - client = MagicMock() - client.rest_request.return_value = [] - items = list(fetch_pull_requests_from_github(client, "owner", "repo")) - assert items == [] - - -def test_fetch_pull_requests_from_github_calls_correct_endpoint(): - """fetch_pull_requests_from_github calls .../pulls with state=all.""" - client = MagicMock() - client.rest_request.return_value = [] - list(fetch_pull_requests_from_github(client, "owner", "repo")) - call_args = client.rest_request.call_args - assert "/repos/owner/repo/pulls" in call_args[0][0] - params = call_args[0][1] or {} - assert params["state"] == "all" diff --git 
a/github_activity_tracker/tests/test_fetcher_commits_backward.py b/github_activity_tracker/tests/test_fetcher_commits_backward.py new file mode 100644 index 00000000..35addf48 --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_commits_backward.py @@ -0,0 +1,182 @@ +"""Tests for fetch_commits_from_github backward pagination (oldest→newest).""" + +from datetime import datetime, timezone +from unittest.mock import MagicMock + +from github_activity_tracker.fetcher import fetch_commits_from_github + + +def test_fetch_commits_single_page_yields_oldest_first(): + """fetch_commits_from_github with single page yields commits in reverse (oldest first).""" + client = MagicMock() + # Page 1 has no "last" link (single page) + client.rest_request_with_all_links.return_value = ( + [ + {"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}, + {"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, + ], + {}, # No links = single page + ) + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield oldest→newest: c1, c2, c3 + assert [c["sha"] for c in commits] == ["c1", "c2", "c3"] + + +def test_fetch_commits_next_without_last_forward_pagination(): + """When rel=last is omitted but rel=next is present, follow next for all pages.""" + client = MagicMock() + + client.rest_request_with_all_links.return_value = ( + [{"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}], + {"next": "https://api.github.com/repos/o/r/commits?page=2"}, + ) + + client.rest_request_url_with_all_links.side_effect = [ + ( + [{"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}], + {"next": "https://api.github.com/repos/o/r/commits?page=3"}, + ), + ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {}, + ), + ] + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + assert [c["sha"] for c in commits] == ["c1", "c2", "c3"] + assert client.rest_request_url_with_all_links.call_count == 2 + + +def test_fetch_commits_multiple_pages_backward_traversal(): + """fetch_commits_from_github walks backward from last page to first.""" + client = MagicMock() + + # Page 1: has "last" link pointing to page 3 + client.rest_request_with_all_links.return_value = ( + [{"sha": "c9", "commit": {"author": {"date": "2024-01-09T00:00:00Z"}}}], + { + "next": "https://api.github.com/repos/o/r/commits?page=2", + "last": "https://api.github.com/repos/o/r/commits?page=3", + }, + ) + + # Page 3 (last): has "prev" pointing to page 2 + # Page 2: has "prev" pointing to page 1 + client.rest_request_url_with_all_links.side_effect = [ + # Page 3 + ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + { + "prev": "https://api.github.com/repos/o/r/commits?page=2", + "first": "https://api.github.com/repos/o/r/commits?page=1", + }, + ), + # Page 2 + ( + [{"sha": "c5", "commit": {"author": {"date": "2024-01-05T00:00:00Z"}}}], + { + "prev": "https://api.github.com/repos/o/r/commits?page=1", + "first": "https://api.github.com/repos/o/r/commits?page=1", + }, + ), + # Page 1 is cached, not fetched again + ] + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + commits = 
list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield oldest→newest: c1 (page 3), c5 (page 2), c9 (page 1 cached) + assert [c["sha"] for c in commits] == ["c1", "c5", "c9"] + # Page 1 should NOT be fetched again via rest_request_url_with_all_links + assert client.rest_request_url_with_all_links.call_count == 2 + + +def test_fetch_commits_caches_first_page(): + """fetch_commits_from_github reuses cached page 1 data when prev returns to page 1.""" + client = MagicMock() + + # Page 1 + page1_data = [ + {"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}, + ] + client.rest_request_with_all_links.return_value = ( + page1_data, + {"last": "https://api.github.com/repos/o/r/commits?page=2"}, + ) + + # Page 2 (last): prev points back to page 1 + client.rest_request_url_with_all_links.return_value = ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {"prev": "https://api.github.com/repos/o/r/commits?page=1"}, + ) + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield c1 (page 2), c3 (page 1 from cache) + assert [c["sha"] for c in commits] == ["c1", "c3"] + # rest_request_url_with_all_links called only once for page 2 + assert client.rest_request_url_with_all_links.call_count == 1 + + +def test_fetch_commits_filters_by_date_range(): + """fetch_commits_from_github filters commits outside start_time/end_time.""" + client = MagicMock() + client.rest_request_with_all_links.return_value = ( + [ + {"sha": "c4", "commit": {"author": {"date": "2024-01-04T00:00:00Z"}}}, + {"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, + ], + {}, + ) + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + start = datetime(2024, 1, 2, tzinfo=timezone.utc) + end = datetime(2024, 1, 3, tzinfo=timezone.utc) + commits = list( + fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end) + ) + + # Only c2 is in range [2024-01-02, 2024-01-03] + assert [c["sha"] for c in commits] == ["c2"] + + +def test_fetch_commits_handles_304_not_modified(): + """fetch_commits_from_github returns immediately on 304 when using etag_cache.""" + client = MagicMock() + etag_cache = MagicMock() + etag_cache.get.return_value = "abc123" + + # 304 response: data is None + client.rest_request_conditional_with_all_links.return_value = (None, "abc123", {}) + + commits = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + assert commits == [] + # Should not attempt to paginate + client.rest_request_url_with_all_links.assert_not_called() diff --git a/github_activity_tracker/tests/test_fetcher_date_helpers.py b/github_activity_tracker/tests/test_fetcher_date_helpers.py new file mode 100644 index 00000000..0dbd638b --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_date_helpers.py @@ -0,0 +1,106 @@ +"""Tests for fetcher date range helper functions.""" + +from datetime import datetime, timezone + +from github_activity_tracker.fetcher import _make_aware, _in_date_range + + +def test_make_aware_converts_naive_to_utc(): + """_make_aware converts naive datetime to UTC-aware.""" + naive = datetime(2024, 1, 1, 12, 0, 0) + result = _make_aware(naive) + assert result.tzinfo == timezone.utc + assert result.year == 2024 + assert result.month == 1 + assert result.day == 1 + + +def 
test_make_aware_preserves_aware_datetime(): + """_make_aware returns aware datetime as-is.""" + aware = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + result = _make_aware(aware) + assert result is aware + + +def test_make_aware_preserves_non_utc_aware_datetime(): + """_make_aware returns non-UTC aware datetime as-is (does not convert to UTC).""" + from datetime import timedelta + + # Create a datetime in UTC+5 + utc_plus_5 = timezone(timedelta(hours=5)) + dt = datetime(2024, 1, 1, 12, 0, 0, tzinfo=utc_plus_5) + result = _make_aware(dt) + + # _make_aware returns aware datetimes as-is; it doesn't convert to UTC + assert result is dt + assert result.tzinfo == utc_plus_5 + + +def test_in_date_range_returns_true_when_in_range(): + """_in_date_range returns True when dt is within [start_time, end_time].""" + dt = datetime(2024, 1, 5, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is True + + +def test_in_date_range_returns_false_when_before_start(): + """_in_date_range returns False when dt is before start_time.""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + start = datetime(2024, 1, 5, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is False + + +def test_in_date_range_returns_false_when_after_end(): + """_in_date_range returns False when dt is after end_time.""" + dt = datetime(2024, 1, 15, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is False + + +def test_in_date_range_returns_true_when_no_start_time(): + """_in_date_range returns True when start_time is None (no lower bound).""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, None, end) is True + + +def test_in_date_range_returns_true_when_no_end_time(): + """_in_date_range returns True when end_time is None (no upper bound).""" + dt = datetime(2024, 1, 15, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, None) is True + + +def test_in_date_range_returns_true_when_no_bounds(): + """_in_date_range returns True when both start_time and end_time are None.""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + + assert _in_date_range(dt, None, None) is True + + +def test_in_date_range_handles_naive_start_and_end(): + """_in_date_range handles naive start_time and end_time by assuming UTC.""" + dt = datetime(2024, 1, 5, tzinfo=timezone.utc) + start = datetime(2024, 1, 1) # Naive + end = datetime(2024, 1, 10) # Naive + + assert _in_date_range(dt, start, end) is True + + +def test_in_date_range_inclusive_boundaries(): + """_in_date_range is inclusive on both boundaries.""" + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + # Exactly at start + assert _in_date_range(start, start, end) is True + # Exactly at end + assert _in_date_range(end, start, end) is True diff --git a/github_activity_tracker/tests/test_fetcher_issues_and_prs.py b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py new file mode 100644 index 00000000..f75be7e9 --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py @@ -0,0 +1,180 @@ +"""Tests for fetch_issues_and_prs_from_github unified fetcher.""" + +from datetime import datetime, 
timezone +from unittest.mock import MagicMock + +from github_activity_tracker.fetcher import fetch_issues_and_prs_from_github + + +def test_fetch_issues_and_prs_routes_issue_correctly(): + """fetch_issues_and_prs_from_github yields issue with issue_info key when no pull_request key.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + { + "number": 1, + "updated_at": "2024-01-01T00:00:00Z", + "title": "Bug", + } + ], + None, # No next page + ) + client.rest_request.side_effect = [ + {"number": 1, "title": "Bug", "body": "Full issue"}, # Full issue detail + [], # Comments + ] + + items = list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + assert len(items) == 1 + assert "issue_info" in items[0] + assert "pr_info" not in items[0] + assert items[0]["issue_info"]["number"] == 1 + assert items[0]["comments"] == [] + + +def test_fetch_issues_and_prs_routes_pr_correctly(): + """fetch_issues_and_prs_from_github yields PR with pr_info key when pull_request key present.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + { + "number": 2, + "updated_at": "2024-01-02T00:00:00Z", + "title": "Feature", + "pull_request": {"url": "https://api.github.com/repos/o/r/pulls/2"}, + } + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 2, "title": "Feature", "body": "Full PR"}, # Full PR detail + [], # Comments + [], # Reviews + ] + + items = list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + assert len(items) == 1 + assert "pr_info" in items[0] + assert "issue_info" not in items[0] + assert items[0]["pr_info"]["number"] == 2 + assert items[0]["comments"] == [] + assert items[0]["reviews"] == [] + + +def test_fetch_issues_and_prs_fetches_both_in_one_call(): + """fetch_issues_and_prs_from_github processes both issues and PRs from single /issues list.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + {"number": 1, "updated_at": "2024-01-01T00:00:00Z", "title": "Issue"}, + { + "number": 2, + "updated_at": "2024-01-02T00:00:00Z", + "title": "PR", + "pull_request": {"url": "..."}, + }, + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 1, "title": "Issue"}, # Issue detail + [], # Issue comments + {"number": 2, "title": "PR"}, # PR detail + [], # PR comments + [], # PR reviews + ] + + items = list(fetch_issues_and_prs_from_github(client, "o", "r")) + + assert len(items) == 2 + assert "issue_info" in items[0] + assert "pr_info" in items[1] + + +def test_fetch_issues_and_prs_uses_direction_asc(): + """fetch_issues_and_prs_from_github requests items with direction=asc (oldest first).""" + client = MagicMock() + client.rest_request_with_link.return_value = ([], None) + + list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + # Check the params argument (second positional arg) + call_args = client.rest_request_with_link.call_args + params = call_args[0][1] if len(call_args[0]) > 1 else call_args[1].get("params") + assert params["direction"] == "asc" + assert params["sort"] == "updated" + + +def test_fetch_issues_and_prs_filters_by_date_range(): + """fetch_issues_and_prs_from_github filters items outside start_time/end_time.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + {"number": 1, "updated_at": "2024-01-01T00:00:00Z"}, + {"number": 2, "updated_at": "2024-01-05T00:00:00Z"}, + {"number": 3, "updated_at": "2024-01-10T00:00:00Z"}, + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 2}, # Only #2 in range + [], 
# Comments + ] + + start = datetime(2024, 1, 2, tzinfo=timezone.utc) + end = datetime(2024, 1, 8, tzinfo=timezone.utc) + items = list( + fetch_issues_and_prs_from_github( + client, "o", "r", start_time=start, end_time=end + ) + ) + + assert len(items) == 1 + assert items[0]["issue_info"]["number"] == 2 + + +def test_fetch_issues_and_prs_paginates_with_link_header(): + """fetch_issues_and_prs_from_github follows Link rel=next for pagination.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [{"number": 1, "updated_at": "2024-01-01T00:00:00Z"}], + "https://api.github.com/page=2", + ) + client.rest_request_url.return_value = ( + [{"number": 2, "updated_at": "2024-01-02T00:00:00Z"}], + None, + ) + client.rest_request.side_effect = [ + {"number": 1}, + [], + {"number": 2}, + [], + ] + + items = list(fetch_issues_and_prs_from_github(client, "o", "r")) + + assert len(items) == 2 + client.rest_request_url.assert_called_once_with("https://api.github.com/page=2") + + +def test_fetch_issues_and_prs_handles_304_not_modified(): + """fetch_issues_and_prs_from_github skips page on 304 when using etag_cache.""" + client = MagicMock() + etag_cache = MagicMock() + etag_cache.get.return_value = "etag123" + + # First page: 304, second page: empty (end of pagination) + client.rest_request_conditional_with_link.side_effect = [ + (None, "etag123", None), # Page 1: 304 + ([], "new_etag", None), # Page 2: empty list (stops pagination) + ] + + items = list( + fetch_issues_and_prs_from_github(client, "o", "r", etag_cache=etag_cache) + ) + + assert items == [] + # Should have tried page 1 (304) and page 2 (empty) + assert client.rest_request_conditional_with_link.call_count == 2 diff --git a/github_activity_tracker/tests/test_sync.py b/github_activity_tracker/tests/test_sync.py index a79b847a..55ec71b5 100644 --- a/github_activity_tracker/tests/test_sync.py +++ b/github_activity_tracker/tests/test_sync.py @@ -7,35 +7,47 @@ def test_sync_github_passes_start_date_end_date_to_sync_modules(): - """sync_github forwards start_date and end_date to sync_commits, sync_issues, sync_pull_requests.""" + """sync_github forwards start_date and end_date to sync_commits and sync_issues_and_prs.""" mock_repo = MagicMock() start = datetime(2024, 1, 1) end = datetime(2024, 12, 31) with patch("github_activity_tracker.sync.sync_repos") as m_repos, patch( "github_activity_tracker.sync.sync_commits" ) as m_commits, patch( - "github_activity_tracker.sync.sync_issues" - ) as m_issues, patch( - "github_activity_tracker.sync.sync_pull_requests" - ) as m_prs: - sync_github(mock_repo, start_date=start, end_date=end) + "github_activity_tracker.sync.sync_issues_and_prs" + ) as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [], "pull_requests": []} + result = sync_github(mock_repo, start_date=start, end_date=end) + m_repos.assert_called_once_with(mock_repo) m_commits.assert_called_once_with(mock_repo, start_date=start, end_date=end) - m_issues.assert_called_once_with(mock_repo, start_date=start, end_date=end) - m_prs.assert_called_once_with(mock_repo, start_date=start, end_date=end) + m_issues_and_prs.assert_called_once_with(mock_repo, start_date=start, end_date=end) + assert result == {"issues": [], "pull_requests": []} def test_sync_github_calls_sync_without_dates_when_none(): - """sync_github calls sync_commits/issues/pull_requests with start_date and end_date None when not provided.""" + """sync_github calls sync_commits and sync_issues_and_prs with start_date and end_date None when not provided.""" 
mock_repo = MagicMock() with patch("github_activity_tracker.sync.sync_repos"), patch( "github_activity_tracker.sync.sync_commits" ) as m_commits, patch( - "github_activity_tracker.sync.sync_issues" - ) as m_issues, patch( - "github_activity_tracker.sync.sync_pull_requests" - ) as m_prs: - sync_github(mock_repo) + "github_activity_tracker.sync.sync_issues_and_prs" + ) as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [1, 2], "pull_requests": [3]} + result = sync_github(mock_repo) + m_commits.assert_called_once_with(mock_repo, start_date=None, end_date=None) - m_issues.assert_called_once_with(mock_repo, start_date=None, end_date=None) - m_prs.assert_called_once_with(mock_repo, start_date=None, end_date=None) + m_issues_and_prs.assert_called_once_with(mock_repo, start_date=None, end_date=None) + assert result == {"issues": [1, 2], "pull_requests": [3]} + + +def test_sync_github_returns_issues_and_prs_dict(): + """sync_github returns dict with issues and pull_requests keys from sync_issues_and_prs.""" + mock_repo = MagicMock() + with patch("github_activity_tracker.sync.sync_repos"), patch( + "github_activity_tracker.sync.sync_commits" + ), patch("github_activity_tracker.sync.sync_issues_and_prs") as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [10, 20], "pull_requests": [30, 40]} + result = sync_github(mock_repo) + + assert result == {"issues": [10, 20], "pull_requests": [30, 40]} diff --git a/github_activity_tracker/tests/test_sync_issues_and_prs.py b/github_activity_tracker/tests/test_sync_issues_and_prs.py new file mode 100644 index 00000000..aca2c338 --- /dev/null +++ b/github_activity_tracker/tests/test_sync_issues_and_prs.py @@ -0,0 +1,200 @@ +"""Tests for sync_issues_and_prs unified sync function.""" + +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch + +from github_activity_tracker.sync.issues_and_prs import ( + sync_issues_and_prs, +) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_processes_both_types( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs routes items by key to issue or PR processing.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + + # Yield one issue and one PR + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ + {"issue_info": {"number": 1}, "comments": []}, + {"pr_info": {"number": 2}, "comments": [], "reviews": []}, + ] + + with patch( + "github_activity_tracker.sync.issues_and_prs._process_issue_data" + ) as mock_proc_issue, patch( + "github_activity_tracker.sync.issues_and_prs._process_pr_data" + ) as mock_proc_pr, patch( + "github_activity_tracker.sync.issues_and_prs.save_issue_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.save_pr_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.get_issue_json_path" + ) as mock_issue_path, patch( + 
"github_activity_tracker.sync.issues_and_prs.get_pr_json_path" + ) as mock_pr_path: + + mock_issue_path.return_value = MagicMock() + mock_pr_path.return_value = MagicMock() + + result = sync_issues_and_prs(mock_repo) + + assert result == {"issues": [1], "pull_requests": [2]} + mock_proc_issue.assert_called_once() + mock_proc_pr.assert_called_once() + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_uses_max_start_date( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs uses the later of last_issue and last_pr (+1s) as start_date.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + + # Last issue updated at 2024-01-05 + mock_last_issue = MagicMock() + mock_last_issue.issue_updated_at = datetime(2024, 1, 5, tzinfo=timezone.utc) + mock_repo.issues.order_by.return_value.first.return_value = mock_last_issue + + # Last PR updated at 2024-01-03 (older than last issue) + mock_last_pr = MagicMock() + mock_last_pr.pr_updated_at = datetime(2024, 1, 3, tzinfo=timezone.utc) + mock_repo.pull_requests.order_by.return_value.first.return_value = mock_last_pr + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + sync_issues_and_prs(mock_repo) + + # Should use max(issue_date, pr_date) → 2024-01-05 + 1s + call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args + start_date = call_args[0][3] # Fourth positional arg + assert start_date == datetime(2024, 1, 5, 0, 0, 1, tzinfo=timezone.utc) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_processes_existing_jsons_first( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs processes leftover JSON files before fetching from GitHub.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + # Existing JSONs found + mock_existing_issues.return_value = (2, [10, 11]) + mock_existing_prs.return_value = (1, [20]) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + result = sync_issues_and_prs(mock_repo) + + # Should include existing numbers in result + assert 10 in result["issues"] + assert 11 in result["issues"] + assert 20 in result["pull_requests"] + mock_existing_issues.assert_called_once_with(mock_repo) + mock_existing_prs.assert_called_once_with(mock_repo) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") 
+@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_respects_override_start_date( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs uses provided start_date instead of deriving from DB.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + override_start = datetime(2023, 1, 1, tzinfo=timezone.utc) + sync_issues_and_prs(mock_repo, start_date=override_start) + + # Should NOT query DB for last issue/PR + mock_repo.issues.order_by.assert_not_called() + mock_repo.pull_requests.order_by.assert_not_called() + + # Should pass override_start to fetcher + call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args + assert call_args[0][3] == override_start + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_saves_and_removes_json_files( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs writes JSON, processes, then removes file for each item.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ + {"issue_info": {"number": 1}, "comments": []}, + ] + + mock_json_path = MagicMock() + + with patch( + "github_activity_tracker.sync.issues_and_prs._process_issue_data" + ), patch( + "github_activity_tracker.sync.issues_and_prs.save_issue_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.get_issue_json_path", + return_value=mock_json_path, + ): + + sync_issues_and_prs(mock_repo) + + # Should write, then unlink + mock_json_path.parent.mkdir.assert_called_once() + mock_json_path.write_text.assert_called_once() + mock_json_path.unlink.assert_called_once() diff --git a/github_ops/client.py b/github_ops/client.py index 3a73a897..610a8fa3 100644 --- a/github_ops/client.py +++ b/github_ops/client.py @@ -400,6 +400,21 @@ def rest_request_conditional_with_link( next_url = self._parse_link_next(response.headers.get("Link")) return (response.json(), response_etag, next_url) + def rest_request_conditional_with_all_links( + self, + endpoint: str, + params: Optional[dict] = None, + etag: Optional[str] = None, + ) -> tuple[Optional[Union[list, dict]], Optional[str], dict[str, str]]: + """Like rest_request_conditional but returns all Link rels as a dict. + Returns (data, response_etag, links_dict). On 304: (None, etag, {}). 
+ """ + response, response_etag = self._rest_get(endpoint, params=params, etag=etag) + if response is None: + return (None, response_etag, {}) + links = self._parse_link_rels(response.headers.get("Link")) + return (response.json(), response_etag, links) + @staticmethod def _parse_link_next(link_header: Optional[str]) -> Optional[str]: """Parse GitHub Link response header; return URL for rel=\"next\" or None. @@ -410,6 +425,42 @@ def _parse_link_next(link_header: Optional[str]) -> Optional[str]: match = re.search(r'<([^>]+)>;\s*rel="next"', link_header) return match.group(1) if match else None + @staticmethod + def _parse_link_rels(link_header: Optional[str]) -> dict[str, str]: + """Parse GitHub Link response header; return a dict of all rel→url pairs. + Example: {"next": "https://...", "last": "https://...", "prev": "https://..."} + """ + if not link_header: + return {} + return { + rel: url + for url, rel in re.findall(r'<([^>]+)>;\s*rel="([^"]+)"', link_header) + } + + def rest_request_with_all_links( + self, endpoint: str, params: Optional[dict] = None + ) -> tuple[Union[list, dict], dict[str, str]]: + """GET request that returns (data, links_dict) with all Link rels. + links_dict keys include "next", "prev", "last", "first" when present. + """ + response, _ = self._rest_get(endpoint, params=params) + if response is None: + return ({}, {}) + data = response.json() + links = self._parse_link_rels(response.headers.get("Link")) + return (data, links) + + def rest_request_url_with_all_links( + self, full_url: str + ) -> tuple[Union[list, dict], dict[str, str]]: + """GET full_url (e.g. from Link header) and return (data, links_dict) with all rels. + Uses same session (auth, rate limit). For paginated backward/forward traversal. + """ + response = self._rest_get_url(full_url) + data = response.json() + links = self._parse_link_rels(response.headers.get("Link")) + return (data, links) + def rest_request_with_link( self, endpoint: str, params: Optional[dict] = None ) -> tuple[Union[list, dict], Optional[str]]: diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 38a16fd5..307592bd 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -9,6 +9,7 @@ import base64 import logging import os +import random import re import subprocess import threading @@ -26,9 +27,14 @@ logger = logging.getLogger(__name__) # Fewer workers to avoid GitHub secondary rate limit (403 when too many concurrent requests) -_UPLOAD_FOLDER_MAX_WORKERS = 8 +_UPLOAD_FOLDER_MAX_WORKERS = 4 _UPLOAD_FOLDER_BLOB_RETRIES = 5 -_UPLOAD_FOLDER_403_WAIT_SEC = 60 +# Cap concurrent blob POSTs across all executor threads (primary + secondary limit relief) +_UPLOAD_FOLDER_BLOB_MAX_CONCURRENT = 3 +# Max seconds to sleep in one wait after 403 (avoid unbounded sleeps from bad headers) +_UPLOAD_FOLDER_403_MAX_SLEEP_SEC = 900 + +_blob_post_semaphore = threading.BoundedSemaphore(_UPLOAD_FOLDER_BLOB_MAX_CONCURRENT) _thread_local = threading.local() @@ -50,6 +56,38 @@ def _get_worker_session(token: str) -> requests.Session: return _thread_local.session +def _wait_seconds_for_github_403(r: requests.Response, attempt: int) -> float: + """Sleep duration after a 403 from GitHub (primary limit, Retry-After, or fallback).""" + max_sleep = float(_UPLOAD_FOLDER_403_MAX_SLEEP_SEC) + h = r.headers + + remaining = h.get("X-RateLimit-Remaining") + reset_raw = h.get("X-RateLimit-Reset") + try: + if remaining is not None and int(remaining) == 0 and reset_raw is not None: + reset_ts = int(reset_raw) + wait = max(1.0, float(reset_ts) - 
time.time()) + wait += random.uniform(0, 2) + return min(wait, max_sleep) + except (TypeError, ValueError): + pass + + ra = h.get("Retry-After") + if ra is not None: + try: + wait = float(ra) + if wait < 1.0: + wait = 1.0 + wait += random.uniform(0, 1) + return min(wait, max_sleep) + except (TypeError, ValueError): + pass + + base = 60.0 * (2.0**attempt) + wait = min(base + random.uniform(0, 2), max_sleep) + return wait + + def _create_blob_with_retry( base: str, token: str, repo_path: str, local_path: Path ) -> tuple[str, str]: @@ -64,18 +102,13 @@ def _create_blob_with_retry( last_err = None for attempt in range(_UPLOAD_FOLDER_BLOB_RETRIES): try: - r = session.post(url, json=blob_data, timeout=30) + with _blob_post_semaphore: + r = session.post(url, json=blob_data, timeout=30) if r.status_code == 403: - # GitHub secondary rate limit; wait and retry (cap at our constant) - wait_sec = _UPLOAD_FOLDER_403_WAIT_SEC - try: - from_header = int(r.headers.get("Retry-After", wait_sec)) - wait_sec = min(from_header, _UPLOAD_FOLDER_403_WAIT_SEC) - except (TypeError, ValueError): - pass + wait_sec = _wait_seconds_for_github_403(r, attempt) if attempt < _UPLOAD_FOLDER_BLOB_RETRIES - 1: logger.warning( - "Blob upload 403 (rate limit), waiting %ss before retry (%s)", + "Blob upload 403 (rate limit), waiting %.1fs before retry (%s)", wait_sec, repo_path, ) @@ -105,17 +138,43 @@ def _create_blob_with_retry( def _url_with_token(url: str, token: str) -> str: - """Inject token into GitHub HTTPS URL for auth.""" + """Inject credentials into a GitHub HTTPS URL for Git over HTTPS. + + Uses ``x-access-token:`` as the userinfo segment. Required for + fine-grained PATs (``github_pat_...``); classic PATs work with this form too. + """ if not token: return url + auth = f"x-access-token:{token}" return re.sub( r"^(https://)(github\.com/)", - r"\1" + token + r"@\2", + r"\1" + auth + r"@\2", url, count=1, ) +def sanitize_git_output(text: str) -> str: + """Redact credentials from git stderr/stdout snippets before logging. + + Masks GitHub HTTPS PAT forms and other userinfo-in-URL patterns so logs do not + leak tokens when clone/push echoes the remote URL. + """ + if not text: + return text + out = re.sub( + r"(?i)(x-access-token:)[^@\s]+(@)", + r"\1***\2", + text, + ) + out = re.sub( + r"(?i)(https?://)[^/\s?#]+@", + r"\1@", + out, + ) + return out + + def clone_repo( url_or_slug: str, dest_dir: str | Path, @@ -124,7 +183,11 @@ def clone_repo( depth: Optional[int] = None, ) -> None: """ - Clone a GitHub repo. Uses scraping token by default (read-only). + Clone a GitHub repo. + + If ``token`` is omitted, uses the scraping token (``get_github_token(use="scraping")``). + Callers cloning **private** repos must pass ``token=get_github_token(use="write")`` + (or equivalent) so GitHub authenticates with a PAT that has repository access. 
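+
+    Illustrative call (slug and destination are placeholders)::
+
+        clone_repo("your-org/your-private-repo", "/tmp/clone-dest",
+                   token=get_github_token(use="write"), depth=1)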
""" dest_dir = Path(dest_dir) if token is None: @@ -142,7 +205,8 @@ def clone_repo( cmd = ["git", "clone", clone_url, str(dest_dir)] if depth is not None: cmd.extend(["--depth", str(depth)]) - logger.info("Cloning %s -> %s", url_or_slug, dest_dir) + safe_url_or_slug = sanitize_git_output(url_or_slug) + logger.info("Cloning %s -> %s", safe_url_or_slug, dest_dir) try: subprocess.run( cmd, @@ -153,22 +217,41 @@ def clone_repo( errors="replace", timeout=GIT_CMD_TIMEOUT_SECONDS, ) - except subprocess.TimeoutExpired: + except subprocess.TimeoutExpired as e: + safe_cmd: list[str] = ["git", "clone", safe_url_or_slug, str(dest_dir)] + if depth is not None: + safe_cmd.extend(["--depth", str(depth)]) logger.warning( "git clone timed out after %ss (%s -> %s)", GIT_CMD_TIMEOUT_SECONDS, - url_or_slug, + safe_url_or_slug, dest_dir, ) - raise + raise subprocess.TimeoutExpired( + safe_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None except subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) logger.warning( - "git clone failed (%s -> %s), returncode=%s", - url_or_slug, + "git clone failed (%s -> %s), returncode=%s, stderr/stdout_tail=%r", + safe_url_or_slug, dest_dir, e.returncode, + safe_err_tail, ) - raise + # Never re-raise with the real cmd or raw output: they may embed the token. + safe_cmd: list[str] = ["git", "clone", safe_url_or_slug, str(dest_dir)] + if depth is not None: + safe_cmd.extend(["--depth", str(depth)]) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_cmd, safe_stdout, safe_stderr + ) from None def push( @@ -179,12 +262,19 @@ def push( commit_message: Optional[str] = None, add_paths: Optional[list[str | Path]] = None, token: Optional[str] = None, + git_user_name: Optional[str] = None, + git_user_email: Optional[str] = None, ) -> None: """ Push to remote. Uses push token by default. Always runs git add, git commit, then push. Uses commit_message if provided, otherwise "Auto commit in ". add_paths: paths to add (relative to repo_dir); if None, adds all (git add .). + + git_user_name / git_user_email: if set, passed only to the ``git commit`` subprocess + via GIT_AUTHOR_* / GIT_COMMITTER_* env vars (does not modify repo ``git config``). + Any existing GIT_AUTHOR_* / GIT_COMMITTER_* entries are removed from the commit + environment first so ambient or Django-set values are not inherited when unset. 
""" repo_dir = Path(repo_dir) if token is None: @@ -203,10 +293,27 @@ def push( capture_output=True, text=True, ) + commit_env = dict(os.environ) + for _key in ( + "GIT_AUTHOR_NAME", + "GIT_AUTHOR_EMAIL", + "GIT_COMMITTER_NAME", + "GIT_COMMITTER_EMAIL", + ): + commit_env.pop(_key, None) + if git_user_name: + commit_env["GIT_AUTHOR_NAME"] = git_user_name + commit_env["GIT_COMMITTER_NAME"] = git_user_name + if git_user_email: + commit_env["GIT_AUTHOR_EMAIL"] = git_user_email + commit_env["GIT_COMMITTER_EMAIL"] = git_user_email commit_result = subprocess.run( ["git", "-C", str(repo_dir), "commit", "-m", message], capture_output=True, text=True, + encoding="utf-8", + errors="replace", + env=commit_env, ) if commit_result.returncode != 0: out = (commit_result.stderr or "") + (commit_result.stdout or "") @@ -231,6 +338,9 @@ def push( if branch: cmd.append(branch) logger.info("Pushing %s %s", repo_dir, branch or "(current)") + safe_push_cmd = ["git", "-C", str(repo_dir), "push", remote_url] + if branch: + safe_push_cmd.append(branch) try: subprocess.run( cmd, @@ -241,16 +351,32 @@ def push( errors="replace", timeout=GIT_CMD_TIMEOUT_SECONDS, ) - except subprocess.TimeoutExpired: + except subprocess.TimeoutExpired as e: logger.warning( "git push timed out after %ss (%s)", GIT_CMD_TIMEOUT_SECONDS, repo_dir, ) - raise + raise subprocess.TimeoutExpired( + safe_push_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None except subprocess.CalledProcessError as e: - logger.warning("git push failed (%s), returncode=%s", repo_dir, e.returncode) - raise + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git push failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_push_cmd, safe_stdout, safe_stderr + ) from None def pull( @@ -286,7 +412,144 @@ def pull( if branch: cmd.append(branch) logger.info("Pulling %s %s", repo_dir, branch or "(current)") - subprocess.run(cmd, check=True, capture_output=True, text=True) + safe_pull_cmd = ["git", "-C", str(repo_dir), "pull", remote_url] + if branch: + safe_pull_cmd.append(branch) + try: + subprocess.run( + cmd, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=GIT_CMD_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + "git pull timed out after %ss (%s)", + GIT_CMD_TIMEOUT_SECONDS, + repo_dir, + ) + raise subprocess.TimeoutExpired( + safe_pull_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None + except subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git pull failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_pull_cmd, safe_stdout, safe_stderr + ) from None + + +def prepare_repo_for_pull( + repo_dir: str | Path, + *, + remote: str = "origin", + token: 
Optional[str] = None, +) -> None: + """ + Fetch remote branch refs (prune), remove untracked files, and reset the working tree. + + Use before checkout/pull on a reused clone that may have local changes or lack + remote-tracking refs for branches that exist only on the remote. + """ + repo_dir = Path(repo_dir) + if token is None: + token = get_github_token(use="push") + result = subprocess.run( + ["git", "-C", str(repo_dir), "remote", "get-url", remote], + capture_output=True, + text=True, + check=True, + ) + remote_url = result.stdout.strip() + auth_url = _url_with_token(remote_url, token or "") + + logger.info("Fetching %s refs (prune) in %s", remote, repo_dir) + fetch_cmd = [ + "git", + "-C", + str(repo_dir), + "fetch", + auth_url, + f"+refs/heads/*:refs/remotes/{remote}/*", + "--prune", + ] + safe_fetch_cmd = [ + "git", + "-C", + str(repo_dir), + "fetch", + remote_url, + f"+refs/heads/*:refs/remotes/{remote}/*", + "--prune", + ] + try: + subprocess.run( + fetch_cmd, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=GIT_CMD_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + "git fetch timed out after %ss (%s)", + GIT_CMD_TIMEOUT_SECONDS, + repo_dir, + ) + raise subprocess.TimeoutExpired( + safe_fetch_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None + except subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git fetch failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_fetch_cmd, safe_stdout, safe_stderr + ) from None + logger.info("Running git clean -fd in %s", repo_dir) + subprocess.run( + ["git", "-C", str(repo_dir), "clean", "-fd"], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + logger.info("Running git reset --hard in %s", repo_dir) + subprocess.run( + ["git", "-C", str(repo_dir), "reset", "--hard"], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) def fetch_file_content( diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 94201bc0..83da5b87 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -1,17 +1,22 @@ """Tests for github_ops git_ops (clone, push, pull, fetch_file_content, upload_folder_to_github).""" +import subprocess from unittest.mock import MagicMock, patch +import pytest import requests from github_ops.git_ops import ( + GIT_CMD_TIMEOUT_SECONDS, _create_blob_with_retry, _url_with_token, clone_repo, fetch_file_content, pull, get_commit_file_changes, + prepare_repo_for_pull, push, + sanitize_git_output, upload_folder_to_github, ) @@ -26,10 +31,10 @@ def test_url_with_token_empty_token_returns_unchanged(): def test_url_with_token_injects_token_before_github_com(): - """_url_with_token injects token into HTTPS GitHub URL.""" + """_url_with_token uses x-access-token form for GitHub HTTPS Git auth.""" url = "https://github.com/owner/repo.git" out = _url_with_token(url, "secret") - assert out == "https://secret@github.com/owner/repo.git" + assert out == 
"https://x-access-token:secret@github.com/owner/repo.git" def test_url_with_token_none_like_token_returns_unchanged(): @@ -42,7 +47,29 @@ def test_url_with_token_only_replaces_first_occurrence(): """_url_with_token uses count=1 so only first https://github.com/ is modified.""" url = "https://github.com/boostorg/boost.git" out = _url_with_token(url, "tok") - assert out == "https://tok@github.com/boostorg/boost.git" + assert out == "https://x-access-token:tok@github.com/boostorg/boost.git" + + +# --- sanitize_git_output --- + + +def test_sanitize_git_output_masks_x_access_token(): + raw = "fatal: https://x-access-token:ghp_SUPER_SECRET@github.com/o/r.git not found" + out = sanitize_git_output(raw) + assert "ghp_SUPER_SECRET" not in out + assert "x-access-token:ghp_" not in out + assert "https://@github.com" in out + + +def test_sanitize_git_output_masks_bare_token_userinfo(): + raw = "error cloning https://github_pat_XXXX@github.com/foo/bar.git" + out = sanitize_git_output(raw) + assert "github_pat_XXXX" not in out + assert "https://@" in out + + +def test_sanitize_git_output_empty(): + assert sanitize_git_output("") == "" # --- clone_repo --- @@ -69,9 +96,9 @@ def test_clone_repo_slug_converted_to_https_url(tmp_path): with patch("github_ops.git_ops.subprocess.run", MagicMock()) as run_mock: clone_repo("owner/repo", tmp_path, token="t") call_args = run_mock.call_args[0][0] - assert ( - "https://github.com/owner/repo.git" in call_args[2] - or "t@github.com" in call_args[2] + clone_url = call_args[2] + assert "https://github.com/owner/repo.git" in clone_url or ( + "x-access-token:t@" in clone_url and "github.com/owner/repo.git" in clone_url ) @@ -94,6 +121,27 @@ def test_clone_repo_uses_get_github_token_when_token_not_provided(tmp_path): get_token.assert_called_once_with(use="scraping") +def test_clone_repo_timeout_redacts_token_from_reraised_exception_cmd(tmp_path): + """clone timeout re-raises TimeoutExpired whose cmd omits the PAT (matches real clone cmd).""" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = subprocess.TimeoutExpired( + [ + "git", + "clone", + "https://x-access-token:LEAK@github.com/o/r.git", + str(tmp_path), + ], + 300, + output="", + stderr="", + ) + with pytest.raises(subprocess.TimeoutExpired) as excinfo: + clone_repo("https://github.com/o/r.git", tmp_path, token="LEAK") + assert "LEAK" not in " ".join(excinfo.value.cmd) + assert "https://github.com/o/r.git" in excinfo.value.cmd[2] + assert excinfo.value.timeout == 300 + + # --- push --- @@ -254,6 +302,117 @@ def test_push_commit_failure_without_nothing_to_commit_raises(tmp_path): assert False, "push should have raised on commit failure" +def test_push_failure_redacts_token_from_reraised_exception_cmd(tmp_path): + """git push failure re-raises CalledProcessError whose cmd uses the token-free remote URL.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(returncode=0, stdout="", stderr=""), + MagicMock(returncode=0, stdout="", stderr=""), + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "push", + "https://x-access-token:SECRET@github.com/o/r.git", + "main", + ], + "", + "rejected", + ), + ] + with pytest.raises(subprocess.CalledProcessError) as excinfo: + push(tmp_path, "origin", branch="main", token="SECRET") + err = excinfo.value + cmd_str = " ".join(err.cmd) + assert "SECRET" not in cmd_str + assert remote in 
cmd_str + assert err.stderr == "rejected" + + +def test_pull_failure_redacts_token_from_reraised_exception_cmd(tmp_path): + """git pull failure re-raises CalledProcessError whose cmd uses the token-free remote URL.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "pull", + "https://x-access-token:XY@github.com/o/r.git", + ], + "", + "error", + ), + ] + with pytest.raises(subprocess.CalledProcessError) as excinfo: + pull(tmp_path, token="XY") + assert "XY" not in " ".join(excinfo.value.cmd) + assert remote in " ".join(excinfo.value.cmd) + + +def test_pull_timeout_redacts_token_from_reraised_exception_cmd(tmp_path): + """git pull timeout re-raises TimeoutExpired whose cmd omits the PAT.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.TimeoutExpired( + [ + "git", + "-C", + str(tmp_path), + "pull", + "https://x-access-token:XY@github.com/o/r.git", + ], + GIT_CMD_TIMEOUT_SECONDS, + output="", + stderr="", + ), + ] + with pytest.raises(subprocess.TimeoutExpired) as excinfo: + pull(tmp_path, token="XY") + assert "XY" not in " ".join(excinfo.value.cmd) + assert remote in " ".join(excinfo.value.cmd) + assert excinfo.value.timeout == GIT_CMD_TIMEOUT_SECONDS + + +def test_prepare_repo_fetch_failure_redacts_token_from_reraised_exception_cmd( + tmp_path, +): + """prepare_repo_for_pull fetch failure re-raises with cmd without embedded PAT.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "fetch", + "https://x-access-token:PAT@github.com/o/r.git", + "+refs/heads/*:refs/remotes/origin/*", + "--prune", + ], + "", + "fetch failed", + ), + ] + with pytest.raises(subprocess.CalledProcessError) as excinfo: + prepare_repo_for_pull(tmp_path, remote="origin", token="PAT") + assert "PAT" not in " ".join(excinfo.value.cmd) + assert remote in excinfo.value.cmd[4] + + # --- pull --- @@ -272,6 +431,7 @@ def test_pull_with_branch_runs_checkout_then_pull(tmp_path): assert calls[0][-1] == "main" assert "pull" in calls[2] assert "main" in calls[2] + assert run_mock.call_args_list[2][1].get("timeout") == GIT_CMD_TIMEOUT_SECONDS def test_pull_without_branch_does_not_run_checkout(tmp_path): @@ -285,6 +445,7 @@ def test_pull_without_branch_does_not_run_checkout(tmp_path): calls = [c[0][0] for c in run_mock.call_args_list] checkout_calls = [c for c in calls if "checkout" in c] assert len(checkout_calls) == 0 + assert run_mock.call_args_list[-1][1].get("timeout") == GIT_CMD_TIMEOUT_SECONDS def test_pull_uses_get_github_token_when_token_not_provided(tmp_path): @@ -496,6 +657,67 @@ def test_create_blob_with_retry_returns_sha_on_success(): mock_session.post.assert_called_once() +def test_create_blob_with_retry_403_waits_using_rate_limit_reset(): + """_create_blob_with_retry sleeps until X-RateLimit-Reset when Remaining is 0.""" + mock_403 = MagicMock() + mock_403.status_code = 403 + mock_403.headers = { + "X-RateLimit-Remaining": "0", + "X-RateLimit-Reset": "1007", + } + mock_ok = MagicMock() + mock_ok.status_code = 201 + mock_ok.json.return_value = {"sha": "sha_after_reset_wait"} 
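+    # Expected wait: X-RateLimit-Reset=1007 with time.time() patched to 1000.0 and
+    # random.uniform patched to 0, so max(1.0, 1007 - 1000.0) = 7.0 → sleep(7.0).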
+ mock_ok.raise_for_status = MagicMock() + + mock_session = MagicMock() + mock_session.post.side_effect = [mock_403, mock_ok] + mock_path = MagicMock() + mock_path.read_bytes.return_value = b"x" + + with patch("github_ops.git_ops._get_worker_session", return_value=mock_session): + with patch("github_ops.git_ops.time.time", return_value=1000.0): + with patch("github_ops.git_ops.random.uniform", return_value=0.0): + with patch("github_ops.git_ops.time.sleep") as sleep_mock: + out = _create_blob_with_retry( + "https://api.github.com/repos/o/r", + "token", + "f.txt", + mock_path, + ) + assert out == ("f.txt", "sha_after_reset_wait") + sleep_mock.assert_called_once_with(7.0) + assert mock_session.post.call_count == 2 + + +def test_create_blob_with_retry_403_exponential_when_no_headers(): + """_create_blob_with_retry uses exponential backoff on 403 without rate-limit headers.""" + mock_403 = MagicMock() + mock_403.status_code = 403 + mock_403.headers = {} + mock_ok = MagicMock() + mock_ok.status_code = 201 + mock_ok.json.return_value = {"sha": "sha_ok"} + mock_ok.raise_for_status = MagicMock() + + mock_session = MagicMock() + mock_session.post.side_effect = [mock_403, mock_ok] + mock_path = MagicMock() + mock_path.read_bytes.return_value = b"x" + + with patch("github_ops.git_ops._get_worker_session", return_value=mock_session): + with patch("github_ops.git_ops.random.uniform", return_value=0.0): + with patch("github_ops.git_ops.time.sleep") as sleep_mock: + out = _create_blob_with_retry( + "https://api.github.com/repos/o/r", + "token", + "f.txt", + mock_path, + ) + assert out == ("f.txt", "sha_ok") + sleep_mock.assert_called_once_with(60.0) + + # --- get_commit_file_changes --- diff --git a/operations/md_ops/github_export.py b/operations/md_ops/github_export.py index 252cc7d0..3abf105e 100644 --- a/operations/md_ops/github_export.py +++ b/operations/md_ops/github_export.py @@ -1,12 +1,14 @@ """ Export synced GitHub issues/PRs as Markdown files into a folder structure -suitable for pushing to a private GitHub repository. +suitable for pushing to a target GitHub repository. Public API: write_md_files(owner, repo, issue_numbers, pr_numbers, output_dir, folder_prefix) detect_renames(remote_tree, new_files) -> list[str] detect_renames_from_dirs(owner, repo, branch, new_files, *, token) -> list[str] Use for large repos (100k+ files); lists only the directories we write to. + detect_stale_titled_paths(base_dir, new_files) -> list[str] + Local Path listing (md_export or clone); same #n title-rename rules; no API. 
Folder structure produced: //issues/YYYY/YYYY-MM/# - .md @@ -155,7 +157,12 @@ def write_md_files( created_at = _parse_dt(created_at_raw) out_path = _md_path( - output_dir, folder_prefix, "pull_requests", created_at, number, title + output_dir, + folder_prefix, + "pull_requests", + created_at, + number, + title, ) try: md_content = pr_json_to_md(pr_data) @@ -170,6 +177,40 @@ def write_md_files( return new_files +def _stale_titled_paths_vs_listing( + new_files: dict[str, str], + files_by_dir: dict[str, list[tuple[str, str]]], + *, + log_prefix: str = "stale_titled", +) -> list[str]: + """Paths to remove: same directory + same #n - prefix as a new file, different filename.""" + if not new_files or not files_by_dir: + return [] + + delete_paths: list[str] = [] + for new_repo_rel in new_files: + new_filename = new_repo_rel.rsplit("/", 1)[-1] + new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" + + m = _NUMBER_PREFIX.match(new_filename) + if not m: + continue + number_str = m.group(1) + prefix = f"#{number_str} - " + + for listed_filename, listed_path in files_by_dir.get(new_dir, []): + if listed_filename.startswith(prefix) and listed_filename != new_filename: + logger.debug( + "%s: %r → %r (title changed, will delete old)", + log_prefix, + listed_path, + new_repo_rel, + ) + delete_paths.append(listed_path) + + return sorted(set(delete_paths)) + + def detect_renames( remote_tree: list[dict], new_files: dict[str, str], @@ -192,7 +233,7 @@ def detect_renames( return [] # Build a lookup: directory → list of (filename, full_path) for blob entries - remote_by_dir: dict[str, list[tuple[str, str]]] = {} + files_by_dir: dict[str, list[tuple[str, str]]] = {} for item in remote_tree: if item.get("type") != "blob": continue @@ -201,29 +242,11 @@ def detect_renames( continue parent = path.rsplit("/", 1)[0] if "/" in path else "" filename = path.rsplit("/", 1)[-1] - remote_by_dir.setdefault(parent, []).append((filename, path)) - - delete_paths: list[str] = [] - for new_repo_rel in new_files: - new_filename = new_repo_rel.rsplit("/", 1)[-1] - new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" - - m = _NUMBER_PREFIX.match(new_filename) - if not m: - continue - number_str = m.group(1) - prefix = f"#{number_str} - " - - for remote_filename, remote_path in remote_by_dir.get(new_dir, []): - if remote_filename.startswith(prefix) and remote_filename != new_filename: - logger.debug( - "detect_renames: %r → %r (title changed, will delete old)", - remote_path, - new_repo_rel, - ) - delete_paths.append(remote_path) + files_by_dir.setdefault(parent, []).append((filename, path)) - return delete_paths + return _stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_renames" + ) def detect_renames_from_dirs( @@ -241,7 +264,7 @@ def detect_renames_from_dirs( only a small number of API calls are made. Args: - owner: Repository owner (e.g. private repo owner). + owner: Repository owner (markdown publish target). repo: Repository name. branch: Branch name. new_files: Dict of {repo_relative_path: local_path} from write_md_files(). 
@@ -260,32 +283,64 @@ def detect_renames_from_dirs( else: dirs.add("") - delete_paths: list[str] = [] + files_by_dir: dict[str, list[tuple[str, str]]] = {} for dir_path in sorted(dirs): - remote_paths = list_remote_directory(owner, repo, branch, dir_path, token=token) - for remote_path in remote_paths: + for remote_path in list_remote_directory( + owner, repo, branch, dir_path, token=token + ): filename = remote_path.rsplit("/", 1)[-1] - m = _NUMBER_PREFIX.match(filename) - if not m: - continue - number_str = m.group(1) - prefix = f"#{number_str} - " remote_dir = remote_path.rsplit("/", 1)[0] if "/" in remote_path else "" - for new_repo_rel in new_files: - new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" - if new_dir != remote_dir: - continue - new_filename = new_repo_rel.rsplit("/", 1)[-1] - if new_filename.startswith(prefix) and new_filename != filename: - logger.debug( - "detect_renames_from_dirs: %r → %r (title changed, will delete old)", - remote_path, - new_repo_rel, - ) - delete_paths.append(remote_path) - break - - return delete_paths + files_by_dir.setdefault(remote_dir, []).append((filename, remote_path)) + + return _stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_renames_from_dirs" + ) + + +def detect_stale_titled_paths( + base_dir: Path, + new_files: dict[str, str], +) -> list[str]: + """Find paths under base_dir to delete (old title) using local directory listings. + + Same rules as detect_renames_from_dirs, but lists each affected directory with + Path.iterdir (no GitHub API). Use on md_export and on a clone after pull. + + Args: + base_dir: Root to resolve paths against (md_export root or repo clone root). + new_files: Dict of {repo_relative_path: local_path} from write_md_files(). + + Returns: + Paths relative to base_dir (posix) that should be unlinked. 
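+
+    Example (illustrative): if base_dir already holds
+    "issues/2024/2024-03/#5 - Old title.md" and new_files maps
+    "issues/2024/2024-03/#5 - New title.md", the old path is returned for deletion.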
+ """ + base_dir = base_dir.resolve() + if not new_files: + return [] + + dirs: set[str] = set() + for repo_rel in new_files: + if "/" in repo_rel: + dirs.add(repo_rel.rsplit("/", 1)[0]) + else: + dirs.add("") + + files_by_dir: dict[str, list[tuple[str, str]]] = {} + for dir_path in dirs: + scan = base_dir if dir_path == "" else base_dir / dir_path + if not scan.is_dir(): + continue + for p in scan.iterdir(): + if p.name.startswith(".") or p.name == ".git": + continue + if not p.is_file() or p.suffix.lower() != ".md": + continue + rel = p.relative_to(base_dir).as_posix() + parent = rel.rsplit("/", 1)[0] if "/" in rel else "" + files_by_dir.setdefault(parent, []).append((p.name, rel)) + + return _stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_stale_titled_paths" + ) def _parse_dt(value: object) -> Optional[datetime]: diff --git a/operations/tests/test_github_export.py b/operations/tests/test_github_export.py new file mode 100644 index 00000000..4679d1ad --- /dev/null +++ b/operations/tests/test_github_export.py @@ -0,0 +1,113 @@ +"""Tests for operations.md_ops.github_export rename and stale-path helpers.""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +from operations.md_ops.github_export import ( + detect_renames, + detect_renames_from_dirs, + detect_stale_titled_paths, +) + + +def test_detect_renames_from_dirs_empty_new_files(): + """No new files means nothing to compare.""" + assert detect_renames_from_dirs("o", "r", "main", {}, token="t") == [] + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_finds_old_title(mock_list_remote: MagicMock): + """Remote dir lists old filename; new_files has new title for same #5.""" + mock_list_remote.return_value = [ + "issues/2024/2024-03/#5 - Old title.md", + ] + new_files = { + "issues/2024/2024-03/#5 - New title.md": "/tmp/x", + } + out = detect_renames_from_dirs("own", "repo", "main", new_files, token="tok") + assert out == ["issues/2024/2024-03/#5 - Old title.md"] + mock_list_remote.assert_called() + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_no_conflict(mock_list_remote: MagicMock): + """Remote only has the same filename as new_files.""" + mock_list_remote.return_value = [ + "issues/2024/2024-03/#5 - Same.md", + ] + new_files = {"issues/2024/2024-03/#5 - Same.md": "/tmp/x"} + assert detect_renames_from_dirs("o", "r", "main", new_files, token="t") == [] + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_non_numbered_md_ignored( + mock_list_remote: MagicMock, +): + """Files not matching #n - prefix are ignored.""" + mock_list_remote.return_value = ["issues/2024/2024-03/README.md"] + new_files = {"issues/2024/2024-03/#5 - T.md": "/tmp/x"} + assert detect_renames_from_dirs("o", "r", "main", new_files, token="t") == [] + + +def test_detect_renames_success_matches_tree(): + """detect_renames uses same semantics as directory listing.""" + tree = [ + {"type": "blob", "path": "issues/2024/2024-03/#5 - Old title.md"}, + ] + new_files = {"issues/2024/2024-03/#5 - New title.md": "/x"} + assert detect_renames(tree, new_files) == ["issues/2024/2024-03/#5 - Old title.md"] + + +def test_detect_renames_empty_tree(): + assert detect_renames([], {"a/b.md": "/x"}) == [] + + +def test_detect_stale_titled_paths_finds_old_file_on_disk(tmp_path: Path): + """Local directory has old title; new_files points to new title.""" + d = tmp_path / "issues" / 
"2024" / "2024-03" + d.mkdir(parents=True) + old = d / "#5 - Old title.md" + old.write_text("old", encoding="utf-8") + new_files = {"issues/2024/2024-03/#5 - New title.md": str(d / "#5 - New title.md")} + stale = detect_stale_titled_paths(tmp_path, new_files) + assert stale == ["issues/2024/2024-03/#5 - Old title.md"] + + +def test_detect_stale_titled_paths_only_canonical(tmp_path: Path): + """Only the new filename present → no stale paths.""" + d = tmp_path / "pull_requests" / "2024" / "2024-01" + d.mkdir(parents=True) + f = d / "#10 - Only.md" + f.write_text("x", encoding="utf-8") + new_files = {"pull_requests/2024/2024-01/#10 - Only.md": str(f)} + assert detect_stale_titled_paths(tmp_path, new_files) == [] + + +def test_detect_stale_titled_paths_missing_month_dir(tmp_path: Path): + """Missing directory is treated as empty.""" + new_files = {"issues/2024/2024-99/#1 - A.md": "/nope"} + assert detect_stale_titled_paths(tmp_path, new_files) == [] + + +def test_detect_stale_titled_paths_empty_new_files(tmp_path: Path): + assert detect_stale_titled_paths(tmp_path, {}) == [] + + +def test_detect_stale_titled_paths_union_two_dirs(tmp_path: Path): + """Multiple parent dirs each with stale file.""" + for sub, old_name in ( + ("issues/2024/2024-01", "#1 - Old.md"), + ("issues/2024/2024-02", "#2 - Was.md"), + ): + p = tmp_path / sub + p.mkdir(parents=True) + (p / old_name).write_text("o", encoding="utf-8") + new_files = { + "issues/2024/2024-01/#1 - New.md": "/a", + "issues/2024/2024-02/#2 - Now.md": "/b", + } + stale = set(detect_stale_titled_paths(tmp_path, new_files)) + assert stale == { + "issues/2024/2024-01/#1 - Old.md", + "issues/2024/2024-02/#2 - Was.md", + } diff --git a/requirements.txt b/requirements.txt index 9c0a5039..cdd67211 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,8 +24,15 @@ slack-bolt>=1.18 pytz>=2024.1 selenium>=4.35 +# wg21_paper_tracker app +beautifulsoup4>=4.12.0 +# cppa_youtube_script_tracker app (YouTube Data API v3 + VTT transcript download) +google-api-python-client>=2.100 +yt-dlp==2026.2.4 + # slack_event_handler (GitHub PR comments) PyGithub>=2.0 + # cppa_pinecone_sync app pinecone>=3.0 langchain-core>=0.1 diff --git a/wg21_paper_tracker/__init__.py b/wg21_paper_tracker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py new file mode 100644 index 00000000..bd57f4c2 --- /dev/null +++ b/wg21_paper_tracker/admin.py @@ -0,0 +1,41 @@ +from django.contrib import admin +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@admin.register(WG21Mailing) +class WG21MailingAdmin(admin.ModelAdmin): + list_display = ("mailing_date", "title", "created_at", "updated_at") + search_fields = ("mailing_date", "title") + ordering = ("-mailing_date",) + + +class WG21PaperAuthorInline(admin.TabularInline): + model = WG21PaperAuthor + extra = 1 + raw_id_fields = ("profile",) + ordering = ("author_order", "id") + + +@admin.register(WG21Paper) +class WG21PaperAdmin(admin.ModelAdmin): + list_display = ( + "paper_id", + "year", + "title", + "document_date", + "mailing", + "subgroup", + "is_downloaded", + ) + search_fields = ("paper_id", "title", "url", "subgroup") + list_filter = ("is_downloaded", "subgroup", "mailing", "year") + ordering = ("-document_date", "-paper_id") + inlines = [WG21PaperAuthorInline] + + +@admin.register(WG21PaperAuthor) +class WG21PaperAuthorAdmin(admin.ModelAdmin): + list_display = ("paper", "profile", "author_order", "created_at") + search_fields 
= ("paper__paper_id", "profile__display_name") + raw_id_fields = ("paper", "profile") + ordering = ("paper", "author_order", "id") diff --git a/wg21_paper_tracker/apps.py b/wg21_paper_tracker/apps.py new file mode 100644 index 00000000..d6f09d9b --- /dev/null +++ b/wg21_paper_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class Wg21PaperTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "wg21_paper_tracker" + verbose_name = "WG21 Paper Tracker" diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py new file mode 100644 index 00000000..05f6e98c --- /dev/null +++ b/wg21_paper_tracker/fetcher.py @@ -0,0 +1,213 @@ +""" +Fetcher for WG21 Papers. +Scrapes the WG21 papers index and specific mailing tables. +""" + +import re +import urllib.parse +from typing import Optional + +import requests +from bs4 import BeautifulSoup +from bs4.element import Tag + +import logging + +logger = logging.getLogger(__name__) + +BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" + +_MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$") +# Paper link in first column: e.g. p1234r0.pdf, n4920.html, sd-9.md +_PAPER_LINK_PATTERN = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) + + +def extract_paper_metadata_from_table_row( + cells: list[Tag], + page_url: str, +) -> Optional[dict]: + """ + Extract paper metadata from a WG21 mailing table row (td/th cells). + + Current year pages (e.g. 2026) use eight columns:: + + WG21 Number | Title | Author | Document Date | Mailing Date | + Previous Version | Subgroup | Disposition + + So **subgroup is index 6**, not 4. Index 4 is *mailing date* (string as shown on the site). + + Older pages used a shorter row (five data columns); then subgroup was at index 4. + If ``len(cells) >= 8`` we use the 8-column layout; otherwise we keep the legacy mapping. + """ + if not cells: + return None + + first_cell = cells[0] + base = urllib.parse.urlparse(BASE_URL) + + title = "" + if len(cells) > 1: + title = cells[1].text.strip() + + authors: list[str] = [] + if len(cells) > 2: + authors_raw = cells[2].text.strip() + if authors_raw: + authors = [ + a.strip() for a in re.split(r",| and ", authors_raw) if a.strip() + ] + + document_date = None + if len(cells) > 3: + date_str = cells[3].text.strip() + if date_str: + document_date = date_str + + # 8+ columns: mailing date [4], previous version [5], subgroup [6], disposition [7] + subgroup = "" + if len(cells) >= 8: + subgroup = cells[6].text.strip() + elif len(cells) > 4: + subgroup = cells[4].text.strip() + + for link in first_cell.find_all("a", href=True): + href = link.get("href", "") + match = _PAPER_LINK_PATTERN.search(href) + if not match: + continue + + paper_url = urllib.parse.urljoin(page_url, href) + parsed = urllib.parse.urlparse(paper_url) + if parsed.scheme not in ("https", "http") or parsed.netloc != base.netloc: + logger.warning("Skipping off-origin paper URL %s", paper_url) + continue + + paper_id = match.group(1).lower() + file_ext = match.group(2).lower() + filename = match.group(0).lower() + + return { + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + } + + return None + + +def _find_table_in_section(anchor) -> Optional[Tag]: + """ + Find the first <table> that belongs to the current mailing section. 
+ Stops at the next mailing anchor (id/name matching mailingYYYY-MM) so we + do not attribute another mailing's table to this section. + """ + if not anchor: + return None + anchor_id = anchor.get("id") or anchor.get("name") or "" + if not _MAILING_ANCHOR_RE.match(anchor_id): + return None + for elem in anchor.next_elements: + if not hasattr(elem, "name"): # NavigableString, etc. + continue + if elem is anchor: + continue + if elem.name == "table": + return elem + if not hasattr(elem, "get"): # e.g. NavigableString + continue + next_id = elem.get("id") or elem.get("name") or "" + if next_id and _MAILING_ANCHOR_RE.match(next_id) and next_id != anchor_id: + return None # next section start; no table in this section + return None + + +def fetch_all_mailings() -> list[dict]: + """ + Fetch the main index and extract all mailings. + Returns a list of dicts: + - mailing_date (e.g. '2025-02') + - title (e.g. '2025-02 pre-Hagenberg mailing') + - year (e.g. '2025') + List is in the order found on the page (usually newest first). + """ + logger.info("Fetching WG21 main index: %s/", BASE_URL) + try: + response = requests.get(f"{BASE_URL}/", timeout=30) + response.raise_for_status() + except requests.RequestException: + logger.exception("Failed to fetch WG21 index.") + return [] + + # The mailings are listed in a markdown-like syntax or links + # Typically: <a href="2025/#mailing2025-02">2025-02 pre-Hagenberg mailing</a> + # Let's parse with BeautifulSoup + soup = BeautifulSoup(response.text, "html.parser") + mailings = [] + + # We look for links pointing to year/#mailingYYYY-MM + pattern = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$") + + for a in soup.find_all("a", href=True): + href = a["href"] + match = pattern.search(href) + if match: + year, mailing_date = match.groups() + title = a.text.strip() + mailings.append( + {"mailing_date": mailing_date, "title": title, "year": year} + ) + + return mailings + + +def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: + """ + Fetch the papers for a specific mailing from the year page. + Returns a list of paper dicts. 
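+    Each dict comes from extract_paper_metadata_from_table_row and carries the keys
+    url, filename, type, paper_id, title, authors, document_date, and subgroup.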
+ """ + url = f"{BASE_URL}/{year}/" + logger.info("Fetching mailing %s from %s", mailing_date, url) + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + except requests.RequestException: + logger.exception("Failed to fetch year page %s.", year) + return [] + + soup = BeautifulSoup(response.text, "html.parser") + anchor_id = f"mailing{mailing_date}" + anchor = soup.find(id=anchor_id) or soup.find(attrs={"name": anchor_id}) + if not anchor: + logger.warning("Anchor %s not found on %s", anchor_id, url) + return [] + + table = _find_table_in_section(anchor) + if not table: + logger.warning("No table found after anchor %s", anchor_id) + return [] + + paper_urls = [] + + for row in table.find_all("tr"): + cells = row.find_all(["td", "th"]) + if not cells or any(cell.get("colspan") for cell in cells): + continue + + paper = extract_paper_metadata_from_table_row(cells, url) + if paper: + paper_urls.append(paper) + + # Remove exact duplicates (same filename) + seen = set() + unique_papers = [] + for p in paper_urls: + if p["filename"] not in seen: + seen.add(p["filename"]) + unique_papers.append(p) + + return unique_papers diff --git a/wg21_paper_tracker/management/__init__.py b/wg21_paper_tracker/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wg21_paper_tracker/management/commands/__init__.py b/wg21_paper_tracker/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py new file mode 100644 index 00000000..824617a8 --- /dev/null +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -0,0 +1,301 @@ +""" +Management command: import_wg21_metadata_from_csv + +Reads workspace/wg21_paper_tracker/metadata.csv (or a given path) and fills +WG21Mailing, WG21Paper, and WG21PaperAuthor using get_or_create_mailing and +get_or_create_paper. Handles missing mailing_date via a placeholder mailing +(unknown / Unknown). +""" + +from __future__ import annotations + +import csv +import logging +import re +from dataclasses import dataclass +from datetime import date +from pathlib import Path +from typing import Optional + +from django.core.management.base import BaseCommand, CommandError +from django.db import IntegrityError +from django.utils.dateparse import parse_date + +from wg21_paper_tracker.models import WG21Paper +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, + get_or_create_paper_author, +) +from wg21_paper_tracker.workspace import get_workspace_root + +logger = logging.getLogger(__name__) + +MAILING_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}$") +TITLE_MAX_LENGTH = 1024 +PLACEHOLDER_MAILING_DATE = "unknown" +PLACEHOLDER_MAILING_TITLE = "Unknown" + + +def _norm(s: str) -> str: + """Return the string stripped of leading/trailing whitespace, or empty string if None.""" + return (s or "").strip() + + +def _normalize_title(raw: str) -> str: + """Replace internal newlines with space and truncate to model max_length.""" + if not raw: + return "" + one_line = " ".join(raw.split()) + return one_line[:TITLE_MAX_LENGTH] if len(one_line) > TITLE_MAX_LENGTH else one_line + + +def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]: + """ + Return (mailing_date, title) for this row. + If CSV mailing_date is non-empty and YYYY-MM, use it with synthetic title. 
+ Otherwise use placeholder mailing_date="unknown", title="Unknown". + """ + cleaned = _norm(csv_mailing_date) + if cleaned and MAILING_DATE_PATTERN.match(cleaned): + return cleaned, f"{cleaned} (from metadata)" + return PLACEHOLDER_MAILING_DATE, PLACEHOLDER_MAILING_TITLE + + +def _parse_document_date(date_str: str): + """Return date or None from CSV date column (e.g. YYYY-MM-DD). Invalid values return None.""" + cleaned = _norm(date_str) + if not cleaned: + return None + # try: + return parse_date(cleaned) + # except (ValueError, TypeError): + # return None + + +def _author_names_from_csv(author_str: str) -> list[str]: + """Split author column by comma, strip each, drop empty.""" + cleaned = _norm(author_str) + if not cleaned: + return [] + return [a.strip() for a in cleaned.split(",") if a.strip()] + + +def _read_csv_rows(csv_path: Path): + """Yield dicts for each row with normalized keys and values.""" + with open(csv_path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + out = {} + for k, v in row.items(): + if k is None: + continue + key = k.strip().lower() + out[key] = _norm(v) if v is not None else "" + # Normalize title (multi-line -> single line, truncate) + if "title" in out: + out["title"] = _normalize_title(out["title"]) + yield out + + +@dataclass(frozen=True) +class _CsvImportRow: + paper_id: str + url: str + mailing_date: str + mailing_title: str + document_date: Optional[date] + year: Optional[int] + title: str + subgroup: str + author_names: list[str] + + +def _parse_csv_import_row(row: dict) -> _CsvImportRow | None: + """Return parsed row, or None when paper_id or url is missing.""" + paper_id = (row.get("paper_id", "") or "").strip().lower() + url = row.get("url", "") + if not paper_id or not url: + return None + + mailing_date, mailing_title = _resolve_mailing_date(row.get("mailing_date", "")) + document_date = _parse_document_date(row.get("date", "")) + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): + year = int(mailing_date[:4]) + elif document_date is not None: + year = document_date.year + else: + year = None + title = row.get("title", "") or paper_id + subgroup = row.get("subgroup", "") + author_names = _author_names_from_csv(row.get("author", "")) + return _CsvImportRow( + paper_id=paper_id, + url=url, + mailing_date=mailing_date, + mailing_title=mailing_title, + document_date=document_date, + year=year, + title=title, + subgroup=subgroup, + author_names=author_names, + ) + + +def _log_dry_run_row(parsed: _CsvImportRow) -> None: + logger.info( + "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", + parsed.paper_id, + parsed.mailing_date, + parsed.document_date, + len(parsed.author_names), + ) + + +def _attach_csv_authors_to_paper(paper: WG21Paper, author_names: list[str]) -> None: + from cppa_user_tracker.services import ( + get_or_create_wg21_paper_author_profile, + ) + + for i, name in enumerate(author_names): + profile, _ = get_or_create_wg21_paper_author_profile(name) + get_or_create_paper_author(paper, profile, i + 1) + + +def _update_paper_on_integrity_error( + parsed: _CsvImportRow, exc: IntegrityError, stats: dict +) -> None: + mailing, _ = get_or_create_mailing(parsed.mailing_date, parsed.mailing_title) + try: + lookup_year = parsed.year if parsed.year is not None else 0 + paper = WG21Paper.objects.filter( + paper_id=parsed.paper_id, year=lookup_year + ).first() + if paper is None: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, exc) + return 
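+        # An existing (paper_id, year) row triggered the IntegrityError: update its
+        # fields in place and re-attach any CSV authors instead of inserting a new row.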
+ paper.url = parsed.url + paper.title = parsed.title + paper.document_date = parsed.document_date + paper.mailing = mailing + paper.subgroup = parsed.subgroup + if parsed.year is not None: + paper.year = parsed.year + paper.save() + stats["papers_updated"] += 1 + if parsed.author_names: + _attach_csv_authors_to_paper(paper, parsed.author_names) + except Exception: + stats["skipped"] += 1 + logger.exception( + "Error for paper_id=%s (after IntegrityError).", + parsed.paper_id, + ) + + +def _upsert_paper_from_csv_row(parsed: _CsvImportRow, stats: dict) -> None: + try: + mailing, mailing_created = get_or_create_mailing( + parsed.mailing_date, parsed.mailing_title + ) + if mailing_created: + stats["mailings_created"] += 1 + + _paper, paper_created = get_or_create_paper( + paper_id=parsed.paper_id, + url=parsed.url, + title=parsed.title, + document_date=parsed.document_date, + mailing=mailing, + subgroup=parsed.subgroup, + author_names=parsed.author_names if parsed.author_names else None, + year=parsed.year, + ) + if paper_created: + stats["papers_created"] += 1 + else: + stats["papers_updated"] += 1 + except IntegrityError as e: + _update_paper_on_integrity_error(parsed, e, stats) + except Exception as e: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, e) + + +class Command(BaseCommand): + help = ( + "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). " + "CSV columns: filename, paper_id, url, title, author, date, mailing_date, subgroup. " + "When mailing_date is empty, papers are linked to a single 'unknown' mailing." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--csv-file", + type=Path, + default=None, + help="Path to metadata CSV (default: workspace/wg21_paper_tracker/metadata.csv)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Only read CSV and report what would be done; do not write to DB.", + ) + + def handle(self, *args, **options): + csv_path = options.get("csv_file") or (get_workspace_root() / "metadata.csv") + dry_run = options["dry_run"] + + if not csv_path.exists(): + raise CommandError(f"File not found: {csv_path}") + + if dry_run: + logger.info("Dry run: no DB writes.") + + stats = { + "rows": 0, + "skipped": 0, + "mailings_created": 0, + "papers_created": 0, + "papers_updated": 0, + } + + for row in _read_csv_rows(csv_path): + stats["rows"] += 1 + try: + parsed = _parse_csv_import_row(row) + except Exception as e: + stats["skipped"] += 1 + paper_id = (row.get("paper_id", "") or "").strip().lower() + logger.error( + "Error parsing document date for paper_id=%s: %s", + paper_id, + e, + ) + continue + + if parsed is None: + stats["skipped"] += 1 + if stats["skipped"] <= 5: + logger.debug( + "Skipping row: missing paper_id or url: %s", + row.get("paper_id", "") or row.get("url", "")[:50], + ) + continue + + if dry_run: + _log_dry_run_row(parsed) + continue + + _upsert_paper_from_csv_row(parsed, stats) + + logger.info( + "Rows processed: %d, skipped: %d, mailings created: %d, papers created: %d, papers updated: %d", + stats["rows"], + stats["skipped"], + stats["mailings_created"], + stats["papers_created"], + stats["papers_updated"], + ) + logger.info("Done.") diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py new file mode 100644 index 00000000..3f0965d8 --- /dev/null +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -0,0 +1,158 @@ +""" +Management command for 
WG21 Paper Tracker. +Runs the pipeline to fetch new mailings, upsert paper metadata in the DB, and optionally +trigger a GitHub repository_dispatch so another repo can download and convert documents. +""" + +import logging + +import requests +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError + +from wg21_paper_tracker.pipeline import run_tracker_pipeline + +logger = logging.getLogger(__name__) + +GITHUB_DISPATCH_URL = "https://api.github.com/repos/{repo}/dispatches" + + +def trigger_github_repository_dispatch( + repo: str, + event_type: str, + token: str, + paper_urls: list[str], +) -> None: + """POST repository_dispatch with client_payload {"papers": [<url>, ...]}.""" + url = GITHUB_DISPATCH_URL.format(repo=repo.strip()) + headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token.strip()}", + "X-GitHub-Api-Version": "2022-11-28", + } + body = { + "event_type": event_type, + "client_payload": {"papers": paper_urls}, + } + logger.info( + "Sending repository_dispatch to %s (event_type=%s, %d URLs).", + repo, + event_type, + len(paper_urls), + ) + response = requests.post(url, json=body, headers=headers, timeout=30) + if not response.ok: + logger.error( + "GitHub repository_dispatch failed: %s %s", + response.status_code, + response.text, + ) + response.raise_for_status() + + +class Command(BaseCommand): + """Run WG21 paper tracker and optionally trigger GitHub repository_dispatch.""" + + help = ( + "Run WG21 paper tracker (scrape, DB update) and send new paper URLs via " + "repository_dispatch when enabled." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--dry-run", + action="store_true", + help="Only log what would be done; do not run the pipeline or dispatch.", + ) + parser.add_argument( + "--from-date", + dest="from_date", + metavar="YYYY-MM", + default=None, + help=( + "Process mailings with mailing_date >= YYYY-MM (WG21 / CSV style). " + "Backfills from that mailing onward; without --to-date, no upper cap." + ), + ) + parser.add_argument( + "--to-date", + dest="to_date", + metavar="YYYY-MM", + default=None, + help=( + "Upper bound: mailing_date <= YYYY-MM. With --from-date, inclusive range; " + "without --from-date, still only mailings newer than DB latest (capped at to)." + ), + ) + + def handle(self, *args, **options): + dry_run = options.get("dry_run", False) + from_date = options.get("from_date") + to_date = options.get("to_date") + if from_date is not None: + from_date = from_date.strip() + if not from_date: + from_date = None + if to_date is not None: + to_date = to_date.strip() + if not to_date: + to_date = None + if dry_run: + if from_date or to_date: + logger.info( + "Dry run: skipping pipeline and GitHub dispatch " + "(from=%r, to=%r).", + from_date, + to_date, + ) + else: + logger.info("Dry run: skipping pipeline and GitHub dispatch.") + return + + logger.info("Starting WG21 Paper Tracker...") + + try: + result = run_tracker_pipeline( + from_mailing_date=from_date, + to_mailing_date=to_date, + ) + n = result.new_paper_count + logger.info("Recorded %d new paper(s); %d URL(s) for dispatch.", n, n) + + if not n: + logger.info("No new papers in this run. 
Skipping GitHub dispatch.") + return + + repo = getattr(settings, "WG21_GITHUB_DISPATCH_REPO", "") or "" + token = getattr(settings, "WG21_GITHUB_DISPATCH_TOKEN", "") or "" + enabled = getattr(settings, "WG21_GITHUB_DISPATCH_ENABLED", False) + event_type = getattr( + settings, + "WG21_GITHUB_DISPATCH_EVENT_TYPE", + "wg21_papers_convert", + ) + + if not enabled or not repo or not token: + logger.warning( + "Skipping GitHub dispatch: set WG21_GITHUB_DISPATCH_ENABLED=True " + "and configure WG21_GITHUB_DISPATCH_REPO and " + "WG21_GITHUB_DISPATCH_TOKEN." + ) + return + try: + trigger_github_repository_dispatch( + repo, + event_type, + token, + list(result.new_paper_urls), + ) + logger.info("repository_dispatch sent successfully.") + except Exception: + logger.exception("Failed to send repository_dispatch.") + raise + + except ValueError as e: + raise CommandError(str(e)) from e + except Exception as e: + logger.exception("WG21 Paper Tracker failed: %s", e) + raise diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py new file mode 100644 index 00000000..9c6b4d68 --- /dev/null +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -0,0 +1,133 @@ +# Merged initial migration: WG21 Mailing, WG21 Paper (year not null), WG21 Paper Author + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), + ] + + operations = [ + migrations.CreateModel( + name="WG21Mailing", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "mailing_date", + models.CharField(db_index=True, max_length=7, unique=True), + ), + ("title", models.CharField(max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "verbose_name": "WG21 Mailing", + "verbose_name_plural": "WG21 Mailings", + "db_table": "wg21_paper_tracker_wg21mailing", + "ordering": ["-mailing_date"], + }, + ), + migrations.CreateModel( + name="WG21Paper", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("paper_id", models.CharField(db_index=True, max_length=255)), + ("url", models.URLField(max_length=1024)), + ("title", models.CharField(db_index=True, max_length=1024)), + ( + "document_date", + models.DateField(blank=True, db_index=True, null=True), + ), + ("year", models.IntegerField(db_index=True, default=0)), + ( + "subgroup", + models.CharField( + blank=True, db_index=True, max_length=255 + ), + ), + ( + "is_downloaded", + models.BooleanField(db_index=True, default=False), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "mailing", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="wg21_paper_tracker.wg21mailing", + ), + ), + ], + options={ + "verbose_name": "WG21 Paper", + "verbose_name_plural": "WG21 Papers", + "db_table": "wg21_paper_tracker_wg21paper", + "ordering": ["-document_date", "-paper_id", "-year"], + "unique_together": {("paper_id", "year")}, + }, + ), + migrations.CreateModel( + name="WG21PaperAuthor", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + 
), + ("author_order", models.PositiveIntegerField(blank=True, null=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "paper", + models.ForeignKey( + db_column="paper_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="authors", + to="wg21_paper_tracker.wg21paper", + ), + ), + ( + "profile", + models.ForeignKey( + db_column="profile_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="cppa_user_tracker.wg21paperauthorprofile", + ), + ), + ], + options={ + "verbose_name": "WG21 Paper Author", + "verbose_name_plural": "WG21 Paper Authors", + "db_table": "wg21_paper_tracker_wg21paperauthor", + "ordering": ["id"], + "unique_together": {("paper", "profile")}, + }, + ), + ] diff --git a/wg21_paper_tracker/migrations/__init__.py b/wg21_paper_tracker/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py new file mode 100644 index 00000000..fede57ba --- /dev/null +++ b/wg21_paper_tracker/models.py @@ -0,0 +1,79 @@ +""" +Models per docs/Schema.md section 7: WG21 Papers Tracker. +References cppa_user_tracker.WG21PaperAuthorProfile (section 1) as author. +""" + +from django.db import models + + +class WG21Mailing(models.Model): + """WG21 mailing release (mailing_date, title).""" + + mailing_date = models.CharField(max_length=7, unique=True, db_index=True) + title = models.CharField(max_length=255) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-mailing_date"] + verbose_name = "WG21 Mailing" + verbose_name_plural = "WG21 Mailings" + + def __str__(self): + return f"{self.mailing_date} ({self.title})" + + +class WG21Paper(models.Model): + """WG21 paper (paper_id, url, title, document_date, year, mailing, subgroup, is_downloaded).""" + + paper_id = models.CharField(max_length=255, db_index=True) + url = models.URLField(max_length=1024) + title = models.CharField(max_length=1024, db_index=True) + document_date = models.DateField(db_index=True, null=True, blank=True) + year = models.IntegerField(default=0, db_index=True) + mailing = models.ForeignKey( + WG21Mailing, + on_delete=models.CASCADE, + related_name="papers", + ) + subgroup = models.CharField(max_length=255, blank=True, db_index=True) + is_downloaded = models.BooleanField(default=False, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + unique_together = [["paper_id", "year"]] + ordering = ["-document_date", "-paper_id", "-year"] + verbose_name = "WG21 Paper" + verbose_name_plural = "WG21 Papers" + + def __str__(self): + return f"{self.paper_id}: {self.title[:60]}" + + +class WG21PaperAuthor(models.Model): + """Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile).""" + + paper = models.ForeignKey( + WG21Paper, + on_delete=models.CASCADE, + related_name="authors", + db_column="paper_id", + ) + profile = models.ForeignKey( + "cppa_user_tracker.WG21PaperAuthorProfile", + on_delete=models.CASCADE, + related_name="papers", + db_column="profile_id", + ) + author_order = models.PositiveIntegerField(null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + unique_together = (("paper", "profile"),) + ordering = ["id"] + verbose_name = "WG21 Paper Author" + verbose_name_plural = "WG21 Paper Authors" + + def __str__(self): + return f"{self.paper.paper_id} - 
{self.profile.display_name}" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py new file mode 100644 index 00000000..a6bf41b7 --- /dev/null +++ b/wg21_paper_tracker/pipeline.py @@ -0,0 +1,370 @@ +""" +Pipeline for WG21 Paper Tracker. +Coordinates scraping and updating the database (metadata only; no file download or GCS). +""" + +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass, field +from datetime import date, datetime +from typing import Any, Optional + +from django.utils.dateparse import parse_date + +from wg21_paper_tracker.fetcher import ( + fetch_all_mailings, + fetch_papers_for_mailing, +) +from wg21_paper_tracker.models import WG21Mailing +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, +) + +logger = logging.getLogger(__name__) + +# WG21 mailing_date and typical CSV column (e.g. 2025-03, 2026-01) +_MAILING_DATE_LABEL_RE = re.compile(r"^\d{4}-\d{2}$") + + +def _normalize_mailing_date_label(label: str, *, field_name: str) -> str: + s = label.strip() + if not _MAILING_DATE_LABEL_RE.match(s): + raise ValueError( + f"Invalid {field_name} {label!r}; " + "expected YYYY-MM (e.g. 2025-03), same as WG21 / CSV mailing keys." + ) + return s + + +def _mailing_date_in_run_scope( + mailing_date: str, + *, + latest_date: str, + from_mailing_date: Optional[str], + to_mailing_date: Optional[str], +) -> bool: + """Whether a mailing key is selected for this run (before retry merge).""" + if from_mailing_date is None and to_mailing_date is None: + return mailing_date > latest_date + + if from_mailing_date is not None and mailing_date < from_mailing_date: + return False + if to_mailing_date is not None and mailing_date > to_mailing_date: + return False + if from_mailing_date is None and to_mailing_date is not None: + return mailing_date > latest_date + return True + + +def _format_priority(ext: str) -> int: + """Prefer adoc > html > ps > pdf when multiple formats exist for one paper_id.""" + priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} + return priorities.get(ext.lower(), 100) + + +def _parse_mailing_year(m_info: dict) -> int: + """Return 4-digit year from the index mailing dict, or 0 if missing/invalid.""" + mailing_date = m_info["mailing_date"] + year_raw = m_info.get("year") + if not year_raw or not str(year_raw).strip(): + logger.warning( + "Mailing %s: year missing or empty, using 0 (fix later).", + mailing_date, + ) + return 0 + try: + year = int(str(year_raw).strip()[:4]) + except (ValueError, TypeError): + logger.warning( + "Mailing %s: year not parseable %r, using 0 (fix later).", + mailing_date, + year_raw, + ) + return 0 + if year <= 0 or year > datetime.now().year + 1: + logger.warning( + "Mailing %s: year invalid, using 0 (fix later).", + mailing_date, + ) + return 0 + return year + + +def _group_fetched_papers_by_id( + papers: list[dict[str, Any]], mailing_date: str +) -> dict[str, list[dict[str, Any]]]: + """Bucket fetcher rows by normalized paper_id.""" + papers_by_id: dict[str, list[dict[str, Any]]] = {} + for p in papers: + pid = (p.get("paper_id") or "").strip().lower() + if not pid: + logger.warning( + "Skipping paper entry without a paper_id in mailing %s: %r", + mailing_date, + p, + ) + continue + papers_by_id.setdefault(pid, []).append(p) + return papers_by_id + + +def _valid_paper_entries_for_id( + p_list: list[dict[str, Any]], pid: str, mailing_date: str +) -> list[dict[str, Any]]: + """Keep rows that have type, url, and title (all non-empty).""" + 
valid: list[dict[str, Any]] = [] + for p in p_list: + type_val = ( + (p.get("type") or "").strip() if isinstance(p.get("type"), str) else "" + ) + url_val = (p.get("url") or "").strip() if isinstance(p.get("url"), str) else "" + title_val = ( + (p.get("title") or "").strip() if isinstance(p.get("title"), str) else "" + ) + if not type_val or not url_val or not title_val: + logger.debug( + "Skipping malformed paper entry for %s in mailing %s: %r", + pid, + mailing_date, + p, + ) + continue + valid.append(p) + return valid + + +def _choose_best_format_entry( + valid_list: list[dict[str, Any]], +) -> dict[str, Any]: + """Pick one row by format priority (adoc first). Precondition: valid_list non-empty.""" + return min( + valid_list, + key=lambda x: _format_priority(str(x.get("type") or "").strip()), + ) + + +def _parse_scraped_document_date(doc_date_str: Any) -> Optional[date]: + if not doc_date_str: + return None + try: + return parse_date(str(doc_date_str).strip()) + except Exception as e: + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + return None + + +def _upsert_paper_from_scraped_row( + pid: str, + best_paper: dict[str, Any], + mailing_obj: WG21Mailing, + year: int, + mailing_date: str, +) -> Optional[str]: + """ + Create or update WG21Paper from the chosen fetcher row. + Returns the document URL if a **new** row was inserted, else None. + """ + url = (best_paper.get("url") or "").strip() + paper_title = (best_paper.get("title") or "").strip() + subgroup = (best_paper.get("subgroup") or "").strip() + authors = best_paper.get("authors") + if not isinstance(authors, list): + authors = [] + if not url or not paper_title: + logger.warning( + "Skipping paper %s in mailing %s due to missing required fields: %r", + pid, + mailing_date, + best_paper, + ) + return None + + doc_date = _parse_scraped_document_date(best_paper.get("document_date")) + _paper_obj, created = get_or_create_paper( + paper_id=pid, + url=url, + title=paper_title, + document_date=doc_date, + mailing=mailing_obj, + subgroup=subgroup, + author_names=authors, + year=year, + ) + return url if created else None + + +def _process_single_mailing(m_info: dict) -> list[str]: + """ + For one mailing from the index: normalize year, get/create WG21Mailing, + fetch paper rows from the site, upsert WG21Paper rows. + + Returns URLs for papers **newly created** in this run for this mailing. + """ + mailing_date = m_info["mailing_date"] + title = m_info["title"] + year = _parse_mailing_year(m_info) + mailing_obj, _ = get_or_create_mailing(mailing_date, title) + + papers = fetch_papers_for_mailing(str(year), mailing_date) + if not papers: + logger.info( + "Mailing %s: no papers found (anchor/table may be missing).", + mailing_date, + ) + return [] + + papers_by_id = _group_fetched_papers_by_id(papers, mailing_date) + new_urls: list[str] = [] + + for pid, p_list in papers_by_id.items(): + valid_list = _valid_paper_entries_for_id(p_list, pid, mailing_date) + if not valid_list: + logger.warning( + "Skipping paper %s in mailing %s: no valid entries (type, url, title)", + pid, + mailing_date, + ) + continue + best_paper = _choose_best_format_entry(valid_list) + url = _upsert_paper_from_scraped_row( + pid, best_paper, mailing_obj, year, mailing_date + ) + if url: + new_urls.append(url) + + return new_urls + + +@dataclass(frozen=True) +class TrackerPipelineResult: + """Result of run_tracker_pipeline: URLs for papers newly created in this run.""" + + new_paper_urls: tuple[str, ...] 
= field(default_factory=tuple) + + @property + def new_paper_count(self) -> int: + return len(self.new_paper_urls) + + +def run_tracker_pipeline( + *, + from_mailing_date: Optional[str] = None, + to_mailing_date: Optional[str] = None, +) -> TrackerPipelineResult: + """ + Run the WG21 tracker pipeline: scrape mailings, upsert papers in the DB. + Returns URLs for rows created in this run (for GitHub repository_dispatch). + + Mailing keys are ``YYYY-MM`` (WG21 / CSV style). Selection: + + - Neither ``from_mailing_date`` nor ``to_mailing_date``: process mailings with + ``mailing_date`` strictly newer than the latest ``WG21Mailing`` in the DB. + - ``from_mailing_date`` only: ``mailing_date >= from_mailing_date``. + - ``to_mailing_date`` only: ``mailing_date > latest_in_db`` and + ``mailing_date <= to_mailing_date`` (incremental runs capped at ``to``). + - Both: ``from_mailing_date <= mailing_date <= to_mailing_date`` (inclusive). + + ``from_mailing_date`` must not be lexicographically after ``to_mailing_date``. + """ + if from_mailing_date is not None: + from_mailing_date = _normalize_mailing_date_label( + from_mailing_date, field_name="from_mailing_date" + ) + if to_mailing_date is not None: + to_mailing_date = _normalize_mailing_date_label( + to_mailing_date, field_name="to_mailing_date" + ) + if ( + from_mailing_date is not None + and to_mailing_date is not None + and from_mailing_date > to_mailing_date + ): + raise ValueError( + f"from_mailing_date {from_mailing_date!r} is after " + f"to_mailing_date {to_mailing_date!r}." + ) + + # 1. Get latest mailing from DB + latest_mailing = ( + WG21Mailing.objects.exclude(mailing_date="unknown") + .order_by("-mailing_date") + .first() + ) + latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01" + + # 2. 
Fetch all mailings + all_mailings = fetch_all_mailings() + if not all_mailings: + logger.warning("No mailings found on WG21 site.") + return TrackerPipelineResult() + + # Filter mailings to process + new_mailings = [ + m + for m in all_mailings + if _mailing_date_in_run_scope( + m["mailing_date"], + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + ] + if from_mailing_date is None and to_mailing_date is None: + baseline_desc = f"latest_in_db={latest_date}" + else: + parts: list[str] = [] + if from_mailing_date is not None: + parts.append(f"from={from_mailing_date}") + if to_mailing_date is not None: + parts.append(f"to={to_mailing_date}") + if from_mailing_date is None: + parts.append(f"latest_in_db={latest_date}") + baseline_desc = ", ".join(parts) + + # Requeue incomplete mailings so transient failures get retried (not just the latest) + retry_dates = set( + WG21Mailing.objects.filter(papers__isnull=True).values_list( + "mailing_date", flat=True + ) + ) + if latest_mailing: + retry_dates.add(latest_mailing.mailing_date) + retry_dates = { + d + for d in retry_dates + if _mailing_date_in_run_scope( + d, + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + } + new_mailing_dates = set(m["mailing_date"] for m in new_mailings) + for current_m in all_mailings: + if ( + current_m["mailing_date"] in retry_dates + and current_m["mailing_date"] not in new_mailing_dates + ): + new_mailings.append(current_m) + + # Sort chronologically (oldest to newest) + new_mailings.sort(key=lambda x: x["mailing_date"]) + + logger.info( + "Pipeline: %s, all_mailings=%d, mailings_to_process=%s", + baseline_desc, + len(all_mailings), + [m["mailing_date"] for m in new_mailings], + ) + new_urls: list[str] = [] + for m_info in new_mailings: + new_urls.extend(_process_single_mailing(m_info)) + + return TrackerPipelineResult(new_paper_urls=tuple(new_urls)) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py new file mode 100644 index 00000000..983493a7 --- /dev/null +++ b/wg21_paper_tracker/services.py @@ -0,0 +1,186 @@ +""" +Database logic for WG21 Paper Tracker. 
+""" + +from __future__ import annotations + +from datetime import date +from typing import TYPE_CHECKING, Optional + +from django.db import IntegrityError, transaction + +from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + +if TYPE_CHECKING: + from cppa_user_tracker.models import WG21PaperAuthorProfile + + +def _normalize_year(year: int | str | None) -> int: + """Return a 4-digit year as int, or 0 if missing/invalid.""" + if year is None: + return 0 + if isinstance(year, int): + return year if 0 < year <= 9999 else 0 + s = str(year).strip()[:4] + return int(s) if s.isdigit() else 0 + + +@transaction.atomic +def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: + mailing, created = WG21Mailing.objects.get_or_create( + mailing_date=mailing_date, defaults={"title": title} + ) + if not created and mailing.title != title: + mailing.title = title + mailing.save(update_fields=["title", "updated_at"]) + return mailing, created + + +def get_or_create_paper( + paper_id: str, + url: str, + title: str, + document_date: date | None, + mailing: WG21Mailing, + subgroup: str = "", + author_names: Optional[list[str]] = None, + author_emails: Optional[list[str]] = None, + year: int | None = None, +) -> tuple[WG21Paper, bool]: + paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") + year_val = _normalize_year(year) + + def _update_paper(paper: WG21Paper) -> bool: + updated = False + if paper.url != url: + paper.url = url + updated = True + if paper.title != title: + paper.title = title + updated = True + if paper.document_date != document_date: + paper.document_date = document_date + updated = True + if paper.mailing_id != mailing.id: + paper.mailing = mailing + updated = True + if paper.subgroup != subgroup: + paper.subgroup = subgroup + updated = True + if paper.year != year_val: + paper.year = year_val + updated = True + if updated: + paper.save() + return updated + + try: + with transaction.atomic(): + if year_val > 0: + # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year + paper = WG21Paper.objects.filter( + paper_id=paper_id, year=year_val + ).first() + if paper: + _update_paper(paper) + created = False + else: + placeholder = WG21Paper.objects.filter( + paper_id=paper_id, year=0 + ).first() + if placeholder: + try: + placeholder.url = url + placeholder.title = title + placeholder.document_date = document_date + placeholder.mailing = mailing + placeholder.subgroup = subgroup + placeholder.year = year_val + placeholder.save() + paper = placeholder + created = False + except IntegrityError: + raise # Roll back this transaction; recovery runs below + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=year_val, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=0, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) + if not created: + _update_paper(paper) + except IntegrityError: + # Placeholder promotion hit (paper_id, year_val) unique constraint; fetch and update canonical row + with transaction.atomic(): + paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() + if not paper: + 
raise + _update_paper(paper) + created = False + + if author_names: + if not created: + for author in paper.authors.all(): + author.delete() + emails = author_emails or [] + for i, name in enumerate(author_names): + email = emails[i] if i < len(emails) else None + profile, _ = get_or_create_wg21_paper_author_profile(name, email=email) + get_or_create_paper_author(paper, profile, i + 1) + + return paper, created + + +def get_or_create_paper_author( + paper: WG21Paper, + profile: WG21PaperAuthorProfile, + author_order: int, +) -> tuple[WG21PaperAuthor, bool]: + """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based). + Updates author_order on existing link if it differs. + """ + if not isinstance(author_order, int) or author_order <= 0: + raise ValueError("author_order must be a positive integer") + link, link_created = WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + defaults={"author_order": author_order}, + ) + if not link_created and link.author_order != author_order: + link.author_order = author_order + link.save(update_fields=["author_order"]) + return link, link_created + + +def mark_paper_downloaded(paper_id: str, year: int | None = None): + paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") + if year is None: + raise ValueError("year is required; pass 0 explicitly for placeholder papers") + year_val = _normalize_year(year) + WG21Paper.objects.filter( + paper_id=paper_id, + year=year_val, + ).update(is_downloaded=True) diff --git a/wg21_paper_tracker/tests/__init__.py b/wg21_paper_tracker/tests/__init__.py new file mode 100644 index 00000000..18e481d7 --- /dev/null +++ b/wg21_paper_tracker/tests/__init__.py @@ -0,0 +1 @@ +# Tests for wg21_paper_tracker app (excluding cloud_run_job). 
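A minimal usage sketch of the services layer in wg21_paper_tracker/services.py above, assuming a configured Django environment with this app's migrations applied; the paper id, URL, mailing title, and author names below are hypothetical.

from datetime import date

from wg21_paper_tracker.services import (
    get_or_create_mailing,
    get_or_create_paper,
    mark_paper_downloaded,
)

# Mailings are keyed by the YYYY-MM label; the stored title is refreshed when it changes.
mailing, _ = get_or_create_mailing("2025-03", "2025-03 post-meeting mailing")

# Papers are unique on (paper_id, year). year=None is stored as 0 (placeholder);
# a later call with a real year promotes that placeholder row instead of duplicating it.
paper, created = get_or_create_paper(
    paper_id="p1000r1",
    url="https://example.com/p1000r1.pdf",
    title="Example paper",
    document_date=date(2025, 3, 15),
    mailing=mailing,
    subgroup="SG1",
    author_names=["First Author", "Second Author"],
    year=2025,
)

# Downstream download jobs flag completion per (paper_id, year); year=0 must be
# passed explicitly for placeholder rows.
mark_paper_downloaded("p1000r1", year=2025)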
diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py new file mode 100644 index 00000000..a099ccb2 --- /dev/null +++ b/wg21_paper_tracker/tests/test_commands.py @@ -0,0 +1,138 @@ +"""Tests for wg21_paper_tracker management commands.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from django.core.management import call_command +from django.core.management.base import CommandError +from django.test.utils import override_settings + +from wg21_paper_tracker.pipeline import TrackerPipelineResult + + +CMD_NAME = "import_wg21_metadata_from_csv" +RUN_TRACKER_CMD = "run_wg21_paper_tracker" + + +def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): + """Command raises CommandError when CSV file does not exist.""" + csv_path = tmp_path / "nonexistent.csv" + assert not csv_path.exists() + + with pytest.raises(CommandError, match=r"File not found:"): + call_command(CMD_NAME, f"--csv-file={csv_path}") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_posts_dispatch_when_enabled(): + """run_wg21_paper_tracker sends repository_dispatch with papers URL list.""" + mock_resp = MagicMock() + mock_resp.ok = True + mock_resp.status_code = 204 + mock_resp.text = "" + + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult( + new_paper_urls=("https://open-std.org/a.pdf", "https://open-std.org/b.pdf") + ), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + return_value=mock_resp, + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + WG21_GITHUB_DISPATCH_REPO="myorg/convert-repo", + WG21_GITHUB_DISPATCH_TOKEN="secret-token", + WG21_GITHUB_DISPATCH_EVENT_TYPE="wg21_papers_convert", + ): + call_command(RUN_TRACKER_CMD) + + m_post.assert_called_once() + assert m_post.call_args[0][0] == ( + "https://api.github.com/repos/myorg/convert-repo/dispatches" + ) + body = m_post.call_args[1]["json"] + assert body["event_type"] == "wg21_papers_convert" + assert body["client_payload"] == { + "papers": [ + "https://open-std.org/a.pdf", + "https://open-std.org/b.pdf", + ], + } + headers = m_post.call_args[1]["headers"] + assert headers["Authorization"] == "Bearer secret-token" + assert headers["Accept"] == "application/vnd.github+json" + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_skips_post_when_no_new_papers(): + """No HTTP request when pipeline returns no new URLs.""" + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + WG21_GITHUB_DISPATCH_REPO="o/r", + WG21_GITHUB_DISPATCH_TOKEN="t", + ): + call_command(RUN_TRACKER_CMD) + m_post.assert_not_called() + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_skips_post_when_dispatch_disabled(): + """No HTTP request when WG21_GITHUB_DISPATCH_ENABLED is False.""" + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(new_paper_urls=("https://x/y",)), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=False, + WG21_GITHUB_DISPATCH_REPO="o/r", + 
WG21_GITHUB_DISPATCH_TOKEN="t", + ): + call_command(RUN_TRACKER_CMD) + m_post.assert_not_called() + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_rejects_invalid_from_date(): + """--from-date must be YYYY-MM.""" + with pytest.raises(CommandError, match="Invalid from_mailing_date"): + call_command(RUN_TRACKER_CMD, "--from-date=bad") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_passes_from_date_to_pipeline(): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ) as m: + call_command(RUN_TRACKER_CMD, "--from-date=2025-03") + m.assert_called_once_with(from_mailing_date="2025-03", to_mailing_date=None) + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_rejects_invalid_to_date(): + with pytest.raises(CommandError, match="Invalid to_mailing_date"): + call_command(RUN_TRACKER_CMD, "--to-date=bad") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_passes_from_and_to_date_to_pipeline(): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ) as m: + call_command(RUN_TRACKER_CMD, "--from-date=2025-01", "--to-date=2025-03") + m.assert_called_once_with(from_mailing_date="2025-01", to_mailing_date="2025-03") diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py new file mode 100644 index 00000000..93b21588 --- /dev/null +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -0,0 +1,274 @@ +"""Tests for wg21_paper_tracker.fetcher.""" + +from unittest.mock import patch, MagicMock + +import requests +from bs4 import BeautifulSoup + +from wg21_paper_tracker.fetcher import ( + BASE_URL, + extract_paper_metadata_from_table_row, + fetch_all_mailings, + fetch_papers_for_mailing, +) + + +# --- fetch_all_mailings --- + + +def test_fetch_all_mailings_returns_empty_on_request_failure(): + """fetch_all_mailings returns [] when requests.get raises RequestException.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.side_effect = requests.RequestException("network error") + result = fetch_all_mailings() + assert result == [] + + +def test_fetch_all_mailings_returns_empty_on_http_error(): + """fetch_all_mailings returns [] when response.raise_for_status raises HTTPError.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.raise_for_status.side_effect = requests.HTTPError("404") + m.return_value = resp + result = fetch_all_mailings() + assert result == [] + + +def test_fetch_all_mailings_parses_links(): + """fetch_all_mailings parses year/#mailingYYYY-MM links and returns mailings.""" + html = """ + <html><body> + <a href="2025/#mailing2025-01">2025-01 pre-meeting mailing</a> + <a href="2025/#mailing2025-02">2025-02 post-meeting mailing</a> + <a href="2024/#mailing2024-11">2024-11 mailing</a> + <a href="other">Ignore</a> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_all_mailings() + assert len(result) == 3 + assert result[0]["mailing_date"] == "2025-01" + assert result[0]["title"] == "2025-01 pre-meeting mailing" + assert result[0]["year"] == "2025" + assert result[1]["mailing_date"] == "2025-02" + assert result[2]["mailing_date"] == "2024-11" + assert result[2]["year"] == "2024" + + +def test_fetch_all_mailings_calls_index_url(): + 
"""fetch_all_mailings calls BASE_URL/ with timeout.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.return_value = MagicMock(text="<html></html>", raise_for_status=MagicMock()) + fetch_all_mailings() + m.assert_called_once_with(f"{BASE_URL}/", timeout=30) + + +# --- fetch_papers_for_mailing --- + + +def test_fetch_papers_for_mailing_returns_empty_on_request_failure(): + """fetch_papers_for_mailing returns [] when requests.get raises RequestException.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.side_effect = requests.RequestException("timeout") + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_returns_empty_when_anchor_missing(): + """fetch_papers_for_mailing returns [] when mailing anchor is not found.""" + html = "<html><body><div id='other'>x</div></body></html>" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_finds_anchor_by_id(): + """fetch_papers_for_mailing finds anchor by id=mailingYYYY-MM.""" + html = """ + <html><body> + <span id="mailing2025-01"></span> + <table> + <tr><td><a href="p1000r0.pdf">p1000r0.pdf</a></td><td>Title</td><td>Author</td><td>2025-01-15</td><td>SG1</td></tr> + </table> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert len(result) == 1 + assert result[0]["paper_id"] == "p1000r0" + assert result[0]["filename"] == "p1000r0.pdf" + assert result[0]["title"] == "Title" + assert result[0]["authors"] == ["Author"] + assert result[0]["document_date"] == "2025-01-15" + assert result[0]["subgroup"] == "SG1" + + +def test_fetch_papers_for_mailing_finds_anchor_by_name(): + """fetch_papers_for_mailing finds anchor by name= when id is missing.""" + html = """ + <html><body> + <a name="mailing2025-01"></a> + <table> + <tr><td><a href="n5034.html">n5034.html</a></td><td>Draft</td></tr> + </table> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert len(result) == 1 + assert result[0]["paper_id"] == "n5034" + assert result[0]["type"] == "html" + + +def test_fetch_papers_for_mailing_normalizes_paper_id_lowercase(): + """fetch_papers_for_mailing returns paper_id in lowercase.""" + html = """ + <html><body> + <span id="mailing2025-01"></span> + <table> + <tr><td><a href="P3039R1.PDF">P3039R1.PDF</a></td></tr> + </table> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result[0]["paper_id"] == "p3039r1" + assert result[0]["filename"] == "p3039r1.pdf" + + +def test_fetch_papers_for_mailing_returns_empty_when_no_table(): + """fetch_papers_for_mailing returns [] when no table follows anchor.""" + html = """ + <html><body> + <span id="mailing2025-01"></span> + <p>No table here</p> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") 
as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_does_not_use_next_mailings_table(): + """First mailing with no table returns []; second mailing's table is not used.""" + html = """ + <html><body> + <h2 id="mailing2025-02">2025-02</h2> + <p>No papers this month.</p> + <h2 id="mailing2025-01">2025-01</h2> + <table> + <tr><td><a href="p1234r1.pdf">p1234r1.pdf</a></td><td>Paper</td><td>A. Author</td><td>2025-01-10</td><td>SG1</td></tr> + </table> + </body></html> + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + first = fetch_papers_for_mailing("2025", "2025-02") + second = fetch_papers_for_mailing("2025", "2025-01") + assert first == [], "2025-02 has no table; must not attribute 2025-01's table" + assert len(second) == 1 + assert second[0]["paper_id"] == "p1234r1" + + +def test_fetch_papers_for_mailing_calls_year_url(): + """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.return_value = MagicMock( + text="<html><body><span id='mailing2025-01'></span></body></html>", + raise_for_status=MagicMock(), + ) + fetch_papers_for_mailing("2025", "2025-01") + m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30) + + +# --- extract_paper_metadata_from_table_row --- + + +def test_extract_paper_metadata_from_table_row_returns_none_when_no_cells(): + """Empty cell list yields no paper.""" + assert extract_paper_metadata_from_table_row([], f"{BASE_URL}/2025/") is None + + +def test_extract_paper_metadata_from_table_row_returns_none_when_no_paper_link(): + """Row without a matching paper href returns None.""" + html = "<tr><td>No link here</td><td>t</td></tr>" + row = BeautifulSoup(html, "html.parser").find("tr") + cells = row.find_all(["td", "th"]) + assert extract_paper_metadata_from_table_row(cells, f"{BASE_URL}/2025/") is None + + +def test_extract_paper_metadata_from_table_row_parses_legacy_five_column_row(): + """Older tables: Number, Title, Author, Document date, Subgroup (subgroup at index 4).""" + html = """ + <tr> + <td><a href="p1234r0.pdf">P1234R0</a></td> + <td>My title</td> + <td>Author One, Author Two</td> + <td>2025-03-15</td> + <td>LEWG</td> + </tr> + """ + row = BeautifulSoup(html, "html.parser").find("tr") + cells = row.find_all(["td", "th"]) + page_url = f"{BASE_URL}/2025/" + result = extract_paper_metadata_from_table_row(cells, page_url) + assert result is not None + assert result["paper_id"] == "p1234r0" + assert result["type"] == "pdf" + assert result["filename"] == "p1234r0.pdf" + assert result["url"] == f"{BASE_URL}/2025/p1234r0.pdf" + assert result["title"] == "My title" + assert result["authors"] == ["Author One", "Author Two"] + assert result["document_date"] == "2025-03-15" + assert result["subgroup"] == "LEWG" + + +def test_extract_paper_metadata_from_table_row_parses_eight_column_row(): + """2026+ style: subgroup is column 7 (index 6), not index 4 (mailing date).""" + html = """ + <tr> + <td><a href="../2026/p1000r7.pdf">P1000R7</a></td> + <td>C++ IS Schedule (proposed)</td> + <td>Herb Sutter</td> + <td>2026-01-13</td> + <td>2026-01</td> + <td><a href="../2024/p1000r6.pdf">P1000R6</a></td> + <td>All of WG21</td> + <td></td> + </tr> + """ + row = BeautifulSoup(html, "html.parser").find("tr") + cells = 
row.find_all(["td", "th"]) + page_url = f"{BASE_URL}/2026/" + result = extract_paper_metadata_from_table_row(cells, page_url) + assert result is not None + assert result["paper_id"] == "p1000r7" + assert result["document_date"] == "2026-01-13" + assert result["subgroup"] == "All of WG21" diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py new file mode 100644 index 00000000..9b4ee7e8 --- /dev/null +++ b/wg21_paper_tracker/tests/test_models.py @@ -0,0 +1,86 @@ +"""Tests for wg21_paper_tracker.models.""" + +from datetime import date + +import pytest +from django.db import IntegrityError, transaction + +from wg21_paper_tracker.models import WG21Mailing, WG21Paper + + +@pytest.mark.django_db +def test_wg21_mailing_str(): + """WG21Mailing.__str__ returns mailing_date and title.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="2025-01 pre-meeting") + assert str(m) == "2025-01 (2025-01 pre-meeting)" + + +@pytest.mark.django_db +def test_wg21_paper_str(): + """WG21Paper.__str__ returns paper_id and truncated title.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title") + p = WG21Paper.objects.create( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="A short title", + document_date=date(2025, 1, 15), + mailing=m, + year=2025, + ) + assert "p1000r0" in str(p) + assert "A short title" in str(p) + + +@pytest.mark.django_db +def test_wg21_paper_str_truncates_long_title(): + """WG21Paper.__str__ truncates title to 60 chars.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title") + long_title = "x" * 100 + p = WG21Paper.objects.create( + paper_id="p1", + url="https://example.com/p.pdf", + title=long_title, + mailing=m, + year=2025, + ) + assert len(str(p).split(": ", 1)[-1]) <= 60 + + +@pytest.mark.django_db +def test_wg21_mailing_ordering(): + """WG21Mailing default ordering is by mailing_date descending.""" + WG21Mailing.objects.create(mailing_date="2025-01", title="A") + WG21Mailing.objects.create(mailing_date="2025-02", title="B") + dates = list(WG21Mailing.objects.values_list("mailing_date", flat=True)) + assert dates == ["2025-02", "2025-01"] + + +@pytest.mark.django_db +def test_wg21_paper_unique_together_paper_id_year(): + """WG21Paper allows same paper_id with different year; rejects duplicate (paper_id, year).""" + m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1") + m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2") + WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/1.pdf", + title="T1", + mailing=m1, + year=2024, + ) + with pytest.raises(IntegrityError): + with transaction.atomic(): + WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/dup.pdf", + title="T1 dup", + mailing=m1, + year=2024, + ) + p2 = WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/2.pdf", + title="T2", + mailing=m2, + year=2025, + ) + assert p2.pk is not None diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py new file mode 100644 index 00000000..099f105b --- /dev/null +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -0,0 +1,221 @@ +"""Tests for wg21_paper_tracker.pipeline.""" + +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.pipeline import TrackerPipelineResult, run_tracker_pipeline + + +# --- run_tracker_pipeline --- + + +@pytest.mark.django_db +def test_run_tracker_pipeline_returns_empty_when_no_mailings(): + """run_tracker_pipeline 
returns empty result when fetch_all_mailings returns [].""" + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]): + result = run_tracker_pipeline() + assert result.new_paper_count == 0 + assert result.new_paper_urls == () + + +@pytest.mark.django_db +def test_run_tracker_pipeline_skips_when_no_new_mailings(): + """run_tracker_pipeline returns empty when all mailings are <= latest in DB.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m: + m.return_value = [ + {"mailing_date": "2025-01", "title": "Old", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest", "year": "2025"}, + ] + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[] + ): + result = run_tracker_pipeline() + assert result.new_paper_count == 0 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_collects_urls_for_new_papers(): + """run_tracker_pipeline returns URLs for papers created in this run.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-01", "title": "Previous", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": "https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + result = run_tracker_pipeline() + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1000r0.pdf",) + + +@pytest.mark.django_db +def test_run_tracker_pipeline_from_mailing_date_backfills_older_than_db_latest(): + """from_mailing_date includes mailings >= date even when DB latest is newer.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Older", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest in DB", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1111r0", + "url": "https://example.com/p1111r0.pdf", + "filename": "p1111r0.pdf", + "title": "January paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + result = run_tracker_pipeline(from_mailing_date="2025-01") + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1111r0.pdf",) + + +@pytest.mark.django_db +def test_run_tracker_pipeline_second_run_no_new_urls(): + """Existing papers do not add URLs on a subsequent run.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": "https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] 
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + first = run_tracker_pipeline() + second = run_tracker_pipeline() + assert first.new_paper_count == 1 + assert second.new_paper_count == 0 + + +def test_tracker_pipeline_result_count(): + """TrackerPipelineResult.new_paper_count matches tuple length.""" + r = TrackerPipelineResult(new_paper_urls=("a", "b")) + assert r.new_paper_count == 2 + + +def test_run_tracker_pipeline_rejects_bad_from_mailing_date(): + """from_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid from_mailing_date"): + run_tracker_pipeline(from_mailing_date="not-valid") + + +def test_run_tracker_pipeline_rejects_bad_to_mailing_date(): + """to_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid to_mailing_date"): + run_tracker_pipeline(to_mailing_date="not-valid") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_rejects_from_after_to(): + with pytest.raises(ValueError, match="after"): + run_tracker_pipeline(from_mailing_date="2025-03", to_mailing_date="2025-01") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_mailing_date_caps_inclusive_range(): + """With from and to, mailings outside [from, to] are skipped.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-03", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Too early", "year": "2025"}, + {"mailing_date": "2025-02", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-03", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-04", "title": "Too late", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p2222r0", + "url": "https://example.com/p2222r0.pdf", + "filename": "p2222r0.pdf", + "title": "Feb", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline( + from_mailing_date="2025-02", to_mailing_date="2025-03" + ) + assert result.new_paper_count == 1 + assert fetch.call_count == 2 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_only_caps_incremental_above_latest(): + """to_mailing_date without from: still require mailing_date > latest_in_db.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Latest") + mailings = [ + {"mailing_date": "2025-01", "title": "Latest", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + {"mailing_date": "2025-03", "title": "Too new for cap", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p3333r0", + "url": "https://example.com/p3333r0.pdf", + "filename": "p3333r0.pdf", + "title": "A", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline(to_mailing_date="2025-02") + assert result.new_paper_count == 1 + assert fetch.call_count == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py new file mode 
100644 index 00000000..bd3a3e29 --- /dev/null +++ b/wg21_paper_tracker/tests/test_services.py @@ -0,0 +1,260 @@ +"""Tests for wg21_paper_tracker.services.""" + +from datetime import date +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, + mark_paper_downloaded, +) + + +# --- get_or_create_mailing --- + + +@pytest.mark.django_db +def test_get_or_create_mailing_creates_new(): + """get_or_create_mailing creates new mailing and returns (mailing, True).""" + m, created = get_or_create_mailing("2025-01", "2025-01 pre-meeting mailing") + assert created is True + assert m.mailing_date == "2025-01" + assert m.title == "2025-01 pre-meeting mailing" + + +@pytest.mark.django_db +def test_get_or_create_mailing_gets_existing(): + """get_or_create_mailing returns existing mailing and (mailing, False).""" + get_or_create_mailing("2025-01", "Original title") + m2, created2 = get_or_create_mailing("2025-01", "Updated title") + assert created2 is False + assert m2.mailing_date == "2025-01" + assert m2.title == "Updated title" # title is updated when different + + +@pytest.mark.django_db +def test_get_or_create_mailing_updates_title_when_different(): + """get_or_create_mailing updates title when existing has different title.""" + get_or_create_mailing("2025-02", "Old title") + m, _ = get_or_create_mailing("2025-02", "New title") + m.refresh_from_db() + assert m.title == "New title" + + +# --- get_or_create_paper --- + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +def test_get_or_create_paper_creates_new(mock_profile, db): + """get_or_create_paper creates new paper and returns (paper, True).""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=date(2025, 1, 15), + mailing=mailing, + subgroup="SG1", + author_names=None, + year=2025, + ) + assert created is True + assert paper.paper_id == "p1000r0" + assert paper.title == "A paper" + assert paper.year == 2025 + assert paper.mailing_id == mailing.id + assert paper.subgroup == "SG1" + mock_profile.assert_not_called() + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +@patch("wg21_paper_tracker.services.get_or_create_paper_author") +def test_get_or_create_paper_calls_author_profile_for_each_author( + mock_get_or_create_paper_author, mock_profile, db +): + """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author.""" + from unittest.mock import MagicMock + + alice_profile = MagicMock() + alice_profile.pk = 1 + bob_profile = MagicMock() + bob_profile.pk = 2 + mock_profile.side_effect = [ + (alice_profile, True), + (bob_profile, True), + ] + mock_get_or_create_paper_author.return_value = (MagicMock(), True) + + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=None, + mailing=mailing, + author_names=["Alice", "Bob"], + year=2025, + ) + assert created is True + assert mock_profile.call_count == 2 + mock_profile.assert_any_call("Alice", email=None) + mock_profile.assert_any_call("Bob", email=None) + assert mock_get_or_create_paper_author.call_count == 2 + mock_get_or_create_paper_author.assert_any_call(paper, alice_profile, 
1) + mock_get_or_create_paper_author.assert_any_call(paper, bob_profile, 2) + + +@pytest.mark.django_db +def test_get_or_create_paper_normalizes_paper_id_lowercase(db): + """get_or_create_paper stores paper_id in lowercase.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id=" P3039R1 ", + url="https://example.com/p3039r1.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + assert paper.paper_id == "p3039r1" + + +@pytest.mark.django_db +def test_get_or_create_paper_gets_existing_and_updates(db): + """get_or_create_paper returns existing and updates fields when different.""" + mailing1, _ = get_or_create_mailing("2025-01", "M1") + mailing2, _ = get_or_create_mailing("2025-02", "M2") + get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/old.pdf", + title="Old title", + document_date=date(2025, 1, 1), + mailing=mailing1, + subgroup="SG1", + year=2025, + ) + paper2, created2 = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/new.pdf", + title="New title", + document_date=date(2025, 2, 1), + mailing=mailing2, + subgroup="SG2", + year=2025, + ) + assert created2 is False + paper2.refresh_from_db() + assert paper2.url == "https://example.com/new.pdf" + assert paper2.title == "New title" + assert paper2.mailing_id == mailing2.id + assert paper2.subgroup == "SG2" + + +@pytest.mark.django_db +def test_get_or_create_paper_year_none_stored_as_zero(db): + """get_or_create_paper with year=None stores 0 so records can be updated later.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="n5034", + url="https://example.com/n5034.html", + title="Draft", + document_date=None, + mailing=mailing, + year=None, + ) + assert paper.year == 0 + + +@pytest.mark.django_db +def test_get_or_create_paper_same_paper_id_different_year_creates_two(db): + """get_or_create_paper creates separate rows for same paper_id different year (unique_together).""" + mailing1, _ = get_or_create_mailing("2024-11", "M1") + mailing2, _ = get_or_create_mailing("2025-01", "M2") + p1, c1 = get_or_create_paper( + paper_id="sd-1", + url="https://example.com/sd-1-2024.pdf", + title="SD 2024", + document_date=None, + mailing=mailing1, + year=2024, + ) + p2, c2 = get_or_create_paper( + paper_id="sd-1", + url="https://example.com/sd-1-2025.pdf", + title="SD 2025", + document_date=None, + mailing=mailing2, + year=2025, + ) + assert c1 is True and c2 is True + assert p1.pk != p2.pk + assert p1.year == 2024 and p2.year == 2025 + + +@pytest.mark.django_db +def test_get_or_create_paper_sets_author_order(db): + """get_or_create_paper sets author_order (1-based) on WG21PaperAuthor links.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p9999", + url="https://example.com/p9999.pdf", + title="Multi-author paper", + document_date=None, + mailing=mailing, + author_names=["First Author", "Second Author", "Third Author"], + year=2025, + ) + links = list(paper.authors.order_by("author_order")) + assert len(links) == 3 + assert links[0].author_order == 1 + assert links[1].author_order == 2 + assert links[2].author_order == 3 + + +# --- mark_paper_downloaded --- + + +@pytest.mark.django_db +def test_mark_paper_downloaded_requires_year(db): + """mark_paper_downloaded raises ValueError when year is omitted.""" + with pytest.raises(ValueError, match="year is required"): + mark_paper_downloaded("p1000r0") + + +@pytest.mark.django_db +def 
test_mark_paper_downloaded_sets_flag(db): + """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year).""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + assert paper.is_downloaded is False + mark_paper_downloaded("p1000r0", year=2025) + paper.refresh_from_db() + assert paper.is_downloaded is True + + +@pytest.mark.django_db +def test_mark_paper_downloaded_normalizes_paper_id(db): + """mark_paper_downloaded matches case-insensitively (normalizes to lower) and by year.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + mark_paper_downloaded(" P1000R0 ", year=2025) + paper.refresh_from_db() + assert paper.is_downloaded is True diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py new file mode 100644 index 00000000..09986dff --- /dev/null +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -0,0 +1,86 @@ +"""Tests for wg21_paper_tracker.workspace.""" + +from pathlib import Path +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.workspace import get_workspace_root, get_raw_dir + + +@pytest.fixture +def mock_workspace_path(tmp_path): + """Patch get_workspace_path to return tmp_path for app slugs.""" + + def _get_path(app_slug): + p = tmp_path / app_slug.replace("/", "_") + p.mkdir(parents=True, exist_ok=True) + return p + + with patch( + "wg21_paper_tracker.workspace.get_workspace_path", + side_effect=_get_path, + ): + yield tmp_path + + +def test_get_workspace_root_returns_path(mock_workspace_path): + """get_workspace_root returns Path for app workspace.""" + root = get_workspace_root() + assert "wg21_paper_tracker" in str(root) + assert root.is_dir() + + +def test_get_workspace_root_calls_get_workspace_path_with_slug(): + """get_workspace_root calls get_workspace_path with app slug.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + m.return_value = Path("/fake/workspace/wg21_paper_tracker") + root = get_workspace_root() + m.assert_called_once_with("wg21_paper_tracker") + assert root == Path("/fake/workspace/wg21_paper_tracker") + + +def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path): + """get_raw_dir returns RAW_DIR/wg21_paper_tracker/<year>/<mailing_date>/.""" + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path + path = get_raw_dir("2025-01", 2025) + expected = mock_workspace_path / "wg21_paper_tracker" / "2025" / "2025-01" + assert path == expected + assert path.is_dir() + + +def test_get_raw_dir_creates_parents(mock_workspace_path): + """get_raw_dir creates parent directories.""" + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path + path = get_raw_dir("2026-02", 2026) + assert path.exists() + assert path.parent.name == "2026" + assert path.name == "2026-02" + + +def test_get_raw_dir_idempotent(mock_workspace_path): + """get_raw_dir can be called twice for same mailing_date without error.""" + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path + p1 = get_raw_dir("2025-01", 2025) + p2 = get_raw_dir("2025-01", 2025) + assert 
p1 == p2 + assert p1.parent == p2.parent + + +def test_get_raw_dir_rejects_invalid_mailing_date(): + """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.).""" + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("../../tmp", 2025) + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025", 2025) + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-1", 2025) + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-13", 2025) + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-00", 2025) + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("", 2025) diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py new file mode 100644 index 00000000..62ec55ef --- /dev/null +++ b/wg21_paper_tracker/workspace.py @@ -0,0 +1,36 @@ +""" +Workspace paths for wg21_paper_tracker. +Temporary file storage during download before uploading to GCS. +""" + +import re +from pathlib import Path + +from django.conf import settings + +from config.workspace import get_workspace_path + +_APP_SLUG = "wg21_paper_tracker" +_RAW_APP_SLUG = f"raw/{_APP_SLUG}" +_MAILING_DATE_RE = re.compile(r"^\d{4}-(0[1-9]|1[0-2])$") + + +def get_workspace_root() -> Path: + return get_workspace_path(_APP_SLUG) + + +def get_raw_dir(mailing_date: str | None, year: int) -> Path: + """Return workspace/raw/wg21_paper_tracker/<year>/<mailing_date>/; creates if missing.""" + if mailing_date is not None and not _MAILING_DATE_RE.fullmatch(mailing_date): + raise ValueError("mailing_date must be in YYYY-MM format") + if getattr(settings, "RAW_DIR", None): + raw_root = Path(settings.RAW_DIR) / _APP_SLUG + else: + raw_root = get_workspace_path(_RAW_APP_SLUG) + raw_root.mkdir(parents=True, exist_ok=True) + if mailing_date: + path = raw_root / str(year) / mailing_date + else: + path = raw_root / str(year) + path.mkdir(parents=True, exist_ok=True) + return path diff --git a/workflow/management/commands/run_all_collectors.py b/workflow/management/commands/run_all_collectors.py index 41f5539d..a3d3e5e3 100644 --- a/workflow/management/commands/run_all_collectors.py +++ b/workflow/management/commands/run_all_collectors.py @@ -22,6 +22,8 @@ "run_boost_mailing_list_tracker", "run_clang_github_tracker", "run_discord_exporter", + "run_wg21_paper_tracker", + "run_cppa_youtube_script_tracker", ]
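A brief worked example of the YYYY-MM selection rules documented in run_tracker_pipeline, exercised through the internal helper _mailing_date_in_run_scope; the dates and the call_command invocation are illustrative only and assume a configured Django environment.

from django.core.management import call_command

from wg21_paper_tracker.pipeline import _mailing_date_in_run_scope

latest = "2025-02"  # newest WG21Mailing label already in the DB

# Default incremental run: only mailings strictly newer than the DB latest are selected.
assert _mailing_date_in_run_scope(
    "2025-03", latest_date=latest, from_mailing_date=None, to_mailing_date=None
)
assert not _mailing_date_in_run_scope(
    "2025-02", latest_date=latest, from_mailing_date=None, to_mailing_date=None
)

# --from-date backfills: labels at or after the bound are selected even when they are
# older than the DB latest.
assert _mailing_date_in_run_scope(
    "2025-01", latest_date=latest, from_mailing_date="2025-01", to_mailing_date=None
)

# --to-date alone keeps the incremental baseline but caps the upper end.
assert not _mailing_date_in_run_scope(
    "2025-04", latest_date=latest, from_mailing_date=None, to_mailing_date="2025-03"
)

# The same bounds are exposed on the management command, e.g. when invoked from
# another command or a scheduler:
call_command("run_wg21_paper_tracker", "--from-date=2025-01", "--to-date=2025-03")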