From e3c1a65d536f0af523bbf4ade6936163e4cee4db Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 09:23:21 -0700 Subject: [PATCH 01/76] Initial wg21-paper-tracker added --- config/settings.py | 8 + .../0005_alter_slackuser_slack_user_id.py | 18 ++ cppa_user_tracker/services.py | 27 +- docs/Schema.md | 20 +- docs/operations/WG21_Cloud_Run.md | 61 ++++ parse_test.py | 8 + parse_year.py | 20 ++ requirements.txt | 6 + wg21_paper_tracker/__init__.py | 0 wg21_paper_tracker/admin.py | 31 +++ wg21_paper_tracker/apps.py | 7 + wg21_paper_tracker/cloud_run_job/Dockerfile | 24 ++ .../cloud_run_job/converters/__init__.py | 9 + .../converters/docling_converter.py | 62 +++++ .../converters/openai_converter.py | 260 ++++++++++++++++++ .../converters/pdfplumber_converter.py | 92 +++++++ wg21_paper_tracker/cloud_run_job/main.py | 100 +++++++ .../cloud_run_job/requirements.txt | 6 + wg21_paper_tracker/fetcher.py | 156 +++++++++++ wg21_paper_tracker/management/__init__.py | 0 .../management/commands/__init__.py | 0 .../commands/run_wg21_paper_tracker.py | 56 ++++ wg21_paper_tracker/migrations/0001_initial.py | 69 +++++ wg21_paper_tracker/migrations/__init__.py | 0 wg21_paper_tracker/models.py | 76 +++++ wg21_paper_tracker/pipeline.py | 174 ++++++++++++ wg21_paper_tracker/services.py | 77 ++++++ wg21_paper_tracker/workspace.py | 23 ++ .../management/commands/run_all_collectors.py | 1 + 29 files changed, 1385 insertions(+), 6 deletions(-) create mode 100644 cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py create mode 100644 docs/operations/WG21_Cloud_Run.md create mode 100644 parse_test.py create mode 100644 parse_year.py create mode 100644 wg21_paper_tracker/__init__.py create mode 100644 wg21_paper_tracker/admin.py create mode 100644 wg21_paper_tracker/apps.py create mode 100644 wg21_paper_tracker/cloud_run_job/Dockerfile create mode 100644 wg21_paper_tracker/cloud_run_job/converters/__init__.py create mode 100644 
wg21_paper_tracker/cloud_run_job/converters/docling_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/converters/openai_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/main.py create mode 100644 wg21_paper_tracker/cloud_run_job/requirements.txt create mode 100644 wg21_paper_tracker/fetcher.py create mode 100644 wg21_paper_tracker/management/__init__.py create mode 100644 wg21_paper_tracker/management/commands/__init__.py create mode 100644 wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py create mode 100644 wg21_paper_tracker/migrations/0001_initial.py create mode 100644 wg21_paper_tracker/migrations/__init__.py create mode 100644 wg21_paper_tracker/models.py create mode 100644 wg21_paper_tracker/pipeline.py create mode 100644 wg21_paper_tracker/services.py create mode 100644 wg21_paper_tracker/workspace.py diff --git a/config/settings.py b/config/settings.py index d45b438..ae4a50f 100644 --- a/config/settings.py +++ b/config/settings.py @@ -48,6 +48,7 @@ "cppa_slack_transcript_tracker", "cppa_slack_tracker", "discord_activity_tracker", + "wg21_paper_tracker", ] MIDDLEWARE = [ @@ -140,6 +141,7 @@ "cppa_slack_tracker", "discord_activity_tracker", "boost_mailing_list_tracker", + "wg21_paper_tracker", "shared", ) WORKSPACE_DIR.mkdir(parents=True, exist_ok=True) @@ -214,6 +216,12 @@ ) ).resolve() +# WG21 Paper Tracker Configuration +WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() +GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() +GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() +WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "").strip() + # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) LOG_FILE = env("LOG_FILE", default="app.log") diff --git 
a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py new file mode 100644 index 0000000..f1cde2c --- /dev/null +++ b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.28 on 2026-03-09 15:35 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('cppa_user_tracker', '0004_alter_slackuser_slack_user_id_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='slackuser', + name='slack_user_id', + field=models.CharField(max_length=64, unique=True), + ), + ] diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index a583894..35503f4 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -26,6 +26,7 @@ MailingListProfile, SlackUser, DiscordProfile, + WG21PaperAuthorProfile, ) @@ -49,7 +50,9 @@ def get_or_create_identity( """Get or create an Identity by display_name. If exists, updates description from defaults.""" lookup = {"display_name": display_name} defaults = defaults or {"description": description} - identity, created = Identity.objects.get_or_create(defaults=defaults, **lookup) + identity, created = Identity.objects.get_or_create( + defaults=defaults, **lookup + ) if ( not created and "description" in defaults @@ -247,7 +250,9 @@ def _get_next_negative_github_account_id() -> int: @transaction.atomic -def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool]: +def get_or_create_slack_user( + user_data: dict[str, Any], +) -> tuple[SlackUser, bool]: """Get or create a SlackUser from Slack API user data. Returns (SlackUser, created). If the user exists, updates username, display_name, and avatar_url from user_data. 
@@ -260,7 +265,9 @@ def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool raise ValueError("Slack user ID ('id') is required") profile = user_data.get("profile") or {} username = (user_data.get("name") or "").strip() - display_name = (user_data.get("real_name") or user_data.get("name") or "").strip() + display_name = ( + user_data.get("real_name") or user_data.get("name") or "" + ).strip() avatar_url = (profile.get("image_72") or "").strip() user, created = SlackUser.objects.get_or_create( slack_user_id=user_id, @@ -303,7 +310,9 @@ def get_or_create_unknown_github_account( ).first() if existing is not None: if email_str and not existing.emails.filter(email=email_str).exists(): - add_email(existing, email_str, is_primary=not existing.emails.exists()) + add_email( + existing, email_str, is_primary=not existing.emails.exists() + ) return existing, False next_id = _get_next_negative_github_account_id() account = get_or_create_github_account( @@ -350,3 +359,13 @@ def get_or_create_discord_profile( profile.is_bot = is_bot profile.save() return profile, created + + +def get_or_create_wg21_paper_author_profile( + display_name: str, +) -> tuple[Any, bool]: + """Get or create a WG21PaperAuthorProfile by display_name.""" + display_name_val = (display_name or "").strip() + return WG21PaperAuthorProfile.objects.get_or_create( + display_name=display_name_val, + ) diff --git a/docs/Schema.md b/docs/Schema.md index 3f9fa87..308a662 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -611,6 +611,7 @@ erDiagram erDiagram Direction LR WG21PaperAuthorProfile ||--o{ WG21PaperAuthor : "author" + WG21Mailing ||--o{ WG21Paper : "has" WG21PaperAuthor }o--|| WG21Paper : "has" WG21PaperAuthor { @@ -620,12 +621,23 @@ erDiagram datetime created_at } + WG21Mailing { + int id PK + string mailing_date UK "IX" + string title + datetime created_at + datetime updated_at + } + WG21Paper { int id PK string paper_id UK "IX" string url string title "IX" - date publication_date 
"IX" + date document_date "IX" + int mailing_id FK "IX" + string subgroup "IX" + boolean is_downloaded "IX" datetime created_at datetime updated_at } @@ -633,6 +645,8 @@ erDiagram **Note:** **WG21PaperAuthorProfile** extends `BaseProfile` (section 1). `profile_id` in WG21PaperAuthor references this profile; each paper can have multiple authors. +**Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing. + **Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. --- @@ -746,7 +760,8 @@ erDiagram | **SlackMessage** | Message in a channel (ts, slack_user_id, message, thread_ts). | 6 | | **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | | **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | -| **WG21Paper** | WG21 paper (paper_id, url, title, publication_date). | 7 | +| **WG21Mailing** | WG21 mailing release (mailing_date, title). | 7 | +| **WG21Paper** | WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded). | 7 | | **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). | 7 | | **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | | **WebsiteVisitCount** | Per-date, per-country visit count. 
| 8 | @@ -790,5 +805,6 @@ erDiagram | SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many | | SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user | | SlackChannel | SlackUser | Creator (many-to-one) | +| WG21Mailing | WG21Paper | Has many papers | | WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) | | WG21Paper | WG21PaperAuthor | Has many authors | diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md new file mode 100644 index 0000000..257e2bc --- /dev/null +++ b/docs/operations/WG21_Cloud_Run.md @@ -0,0 +1,61 @@ +# WG21 Paper Conversion Cloud Run Job + +The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is separated from the main Django project and runs as a Google Cloud Run Job. + +The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job via the Google Cloud Run API when new papers are downloaded. + +## 1. Setup Google Cloud Storage + +Create a GCS bucket (e.g., `wg21-data-collector`). + +Ensure your Django app has the following environment variables configured: +- `WG21_GCS_BUCKET`: The name of the GCS bucket. +- `GCP_PROJECT_ID`: Your GCP project ID. +- `WG21_CLOUD_RUN_JOB_NAME`: (Optional, defaults to `wg21-convert`) The name of the deployed Cloud Run job. +- `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job. + +## 2. Build and Push the Docker Image + +Navigate to the Cloud Run job directory: + +```bash +cd wg21_paper_tracker/cloud_run_job/ +``` + +Build the Docker image. Replace `[PROJECT_ID]` with your GCP Project ID: + +```bash +docker build -t gcr.io/[PROJECT_ID]/wg21-convert . +``` + +Push the image to Google Container Registry (or Artifact Registry): + +```bash +docker push gcr.io/[PROJECT_ID]/wg21-convert +``` + +## 3. Create the Cloud Run Job + +Create the job in Google Cloud. 
We recommend allocating sufficient memory and CPU since Docling and PDFPlumber are resource-intensive. + +```bash +gcloud run jobs create wg21-convert \ + --image gcr.io/[PROJECT_ID]/wg21-convert \ + --memory 8Gi \ + --cpu 4 \ + --region us-central1 \ + --set-env-vars WG21_GCS_BUCKET=wg21-data-collector,OPENROUTER_API_KEY=your_key +``` + +## 4. Service Account & IAM Permissions + +1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API. +2. **GCS Access:** Both the Django application and the Cloud Run job require read/write access to the GCS bucket (`roles/storage.objectAdmin`). + +## 5. Flow Summary + +1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs. +2. It checks the WG21 site for new mailings. +3. If found, it downloads PDFs and uploads them directly to `gs://[WG21_GCS_BUCKET]/raw/wg21_papers/[MAILING_DATE]/`. +4. It calls the Cloud Run API to execute `wg21-convert`. +5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs://[WG21_GCS_BUCKET]/converted/wg21_papers/[MAILING_DATE]/`. 
diff --git a/parse_test.py b/parse_test.py new file mode 100644 index 0000000..19dd034 --- /dev/null +++ b/parse_test.py @@ -0,0 +1,8 @@ +import re +text = """- [2026-01 mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-01) +- [2026-02 pre-Croydon mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-02) +- [2026](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/) N5034-N????""" + +pattern = re.compile(r'\[([^\]]+)\]\([^#]+#mailing(\d{4}-\d{2})\)') +for m in pattern.finditer(text): + print(m.groups()) diff --git a/parse_year.py b/parse_year.py new file mode 100644 index 0000000..1f6277f --- /dev/null +++ b/parse_year.py @@ -0,0 +1,20 @@ +import requests +from bs4 import BeautifulSoup + +response = requests.get('https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/') +soup = BeautifulSoup(response.text, 'html.parser') + +anchor = soup.find('a', id='mailing2026-02') +if not anchor: + print("Anchor not found") +else: + table = anchor.find_next('table') + rows = table.find_all('tr') + print(f"Found {len(rows)} rows in table after anchor") + for row in rows[:3]: + cells = [c.text.strip() for c in row.find_all(['th', 'td'])] + print(cells) + # Also print links in first cell + if row.find('td'): + links = row.find_all('td')[0].find_all('a') + print("Links:", [l['href'] for l in links]) diff --git a/requirements.txt b/requirements.txt index 1b3f84d..289d486 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,9 @@ redis>=5.0 slack-bolt>=1.18 pytz>=2024.1 selenium>=4.35 + +# wg21_paper_tracker app +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +google-cloud-run>=0.10.1 +google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/__init__.py b/wg21_paper_tracker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py new file mode 100644 index 0000000..a22358d --- /dev/null +++ b/wg21_paper_tracker/admin.py @@ -0,0 +1,31 @@ +from 
django.contrib import admin +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@admin.register(WG21Mailing) +class WG21MailingAdmin(admin.ModelAdmin): + list_display = ("mailing_date", "title", "created_at", "updated_at") + search_fields = ("mailing_date", "title") + ordering = ("-mailing_date",) + + +class WG21PaperAuthorInline(admin.TabularInline): + model = WG21PaperAuthor + extra = 1 + raw_id_fields = ("profile",) + + +@admin.register(WG21Paper) +class WG21PaperAdmin(admin.ModelAdmin): + list_display = ("paper_id", "title", "document_date", "mailing", "subgroup", "is_downloaded") + search_fields = ("paper_id", "title", "url", "subgroup") + list_filter = ("is_downloaded", "subgroup", "mailing") + ordering = ("-document_date", "-paper_id") + inlines = [WG21PaperAuthorInline] + + +@admin.register(WG21PaperAuthor) +class WG21PaperAuthorAdmin(admin.ModelAdmin): + list_display = ("paper", "profile", "created_at") + search_fields = ("paper__paper_id", "profile__display_name") + raw_id_fields = ("paper", "profile") diff --git a/wg21_paper_tracker/apps.py b/wg21_paper_tracker/apps.py new file mode 100644 index 0000000..d6f09d9 --- /dev/null +++ b/wg21_paper_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class Wg21PaperTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "wg21_paper_tracker" + verbose_name = "WG21 Paper Tracker" diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile new file mode 100644 index 0000000..21b51ef --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/Dockerfile @@ -0,0 +1,24 @@ +# Use an official Python runtime as a parent image +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies required by converters (e.g. 
Poppler for PDF image extraction) +RUN apt-get update && apt-get install -y --no-install-recommends \ + poppler-utils \ + libgl1-mesa-glx \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application files +COPY . . + +# Run the main script +CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/__init__.py b/wg21_paper_tracker/cloud_run_job/converters/__init__.py new file mode 100644 index 0000000..515d30a --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/__init__.py @@ -0,0 +1,9 @@ +""" +PDF to Markdown converters module. +""" + +from .docling_converter import convert_with_docling +from .pdfplumber_converter import convert_with_pdfplumber +from .openai_converter import convert_with_openai + +__all__ = ["convert_with_docling", "convert_with_pdfplumber", "convert_with_openai"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py new file mode 100644 index 0000000..b113332 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -0,0 +1,62 @@ +""" +Docling-based PDF to Markdown converter. +""" + +from pathlib import Path +from typing import Optional +import logging +logger = logging.getLogger(__name__) + +try: + from docling.document_converter import DocumentConverter # type: ignore[import-untyped] + from docling.datamodel.base_models import InputFormat # type: ignore[import-untyped] + + DOCLING_AVAILABLE = True +except ImportError: + DocumentConverter = None # type: ignore[assignment,misc] + InputFormat = None # type: ignore[assignment,misc] + DOCLING_AVAILABLE = False + logger.warning("Docling not available. Install with: pip install docling") + + +def convert_with_docling(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using Docling. 
+ + Args: + pdf_path: Path to the PDF file. + + Returns: + Markdown content as string, or None if conversion fails. + """ + if not DOCLING_AVAILABLE or DocumentConverter is None: + logger.error("Docling is not available") + return None + + try: + logger.info(f"Attempting Docling conversion for: {pdf_path.name}") + + # Initialize converter + converter = DocumentConverter() + + # Convert PDF to document + result = converter.convert(pdf_path) + + # Extract markdown + markdown_content = result.document.export_to_markdown() + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"Docling conversion successful for: {pdf_path.name}") + logger.info(f"Extracted {len(markdown_content)} characters") + return markdown_content + else: + logger.warning( + f"Docling conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"Docling conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py new file mode 100644 index 0000000..211ade7 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -0,0 +1,260 @@ +""" +OpenAI/OpenRouter-based PDF to Markdown converter with OCR. +""" + +import base64 +from pathlib import Path +from typing import Optional +import requests +import logging +logger = logging.getLogger(__name__) + +# Base configuration fallback +import os +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" +OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") + +try: + from pdf2image import convert_from_path + from PIL import Image, ImageOps + import io + + PDF2IMAGE_AVAILABLE = True +except ImportError: + PDF2IMAGE_AVAILABLE = False + logger.warning( + "pdf2image/PIL not available. 
Install with: pip install pdf2image pillow" + ) + + +def pdf_to_images(pdf_path: Path) -> list[Image.Image]: + """ + Convert PDF pages to images. + + Note: pdf2image should automatically handle PDF rotation metadata, + but we also apply additional rotation correction in correct_image_rotation(). + + Args: + pdf_path: Path to the PDF file. + + Returns: + List of PIL Image objects. + """ + if not PDF2IMAGE_AVAILABLE: + logger.error("pdf2image is not available") + return [] + + try: + logger.info(f"Converting PDF to images: {pdf_path.name}") + # pdf2image should respect PDF rotation, but we'll also check EXIF data + images = convert_from_path(pdf_path, dpi=200) + logger.info(f"Converted {len(images)} pages to images") + return images + except Exception as e: + logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) + return [] + + +def correct_image_rotation(image: Image.Image) -> Image.Image: + """ + Correct image rotation using EXIF data and heuristics. + + Args: + image: PIL Image object. + + Returns: + Corrected PIL Image object. + """ + try: + # First, try to correct using EXIF orientation data + # This handles images that have rotation metadata + corrected_image = ImageOps.exif_transpose(image) + + # If the image was rotated, log it + if corrected_image != image: + logger.debug("Image rotation corrected using EXIF data") + return corrected_image + + # If no EXIF data, check if image might be rotated + # For PDF pages, we can check if width > height suggests landscape + # But we'll keep the original orientation as PDFs can be in any orientation + # The OpenAI vision model can handle rotated text, but it's better to correct it + + return corrected_image + + except Exception as e: + logger.warning(f"Error correcting image rotation: {str(e)}") + return image + + +def image_to_base64(image: Image.Image) -> str: + """ + Convert PIL Image to base64 string. + Automatically corrects rotation before encoding. + + Args: + image: PIL Image object. 
+ + Returns: + Base64 encoded string. + """ + # Correct rotation before encoding + corrected_image = correct_image_rotation(image) + + buffered = io.BytesIO() + corrected_image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode() + return img_str + + +def convert_page_with_openai( + image_base64: str, page_num: int, total_pages: int +) -> Optional[str]: + """ + Convert a single page image to markdown using OpenAI/OpenRouter. + + Args: + image_base64: Base64 encoded image string. + page_num: Current page number. + total_pages: Total number of pages. + + Returns: + Markdown content for the page, or None if conversion fails. + """ + if not OPENROUTER_API_KEY: + logger.error("OpenRouter API key is not set") + return None + + try: + logger.info(f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter") + + url = f"{OPENROUTER_BASE_URL}/chat/completions" + headers = { + "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Content-Type": "application/json", + } + + payload = { + "model": OPENROUTER_MODEL, + "messages": [ + { + "role": "system", + "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. 
If the page appears rotated, read and convert the text in its correct orientation.", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, + }, + ], + }, + ], + "max_tokens": 4000, + } + + response = requests.post(url, json=payload, headers=headers, timeout=120) + response.raise_for_status() + + result = response.json() + markdown_content = result["choices"][0]["message"]["content"] + + logger.info(f"Successfully converted page {page_num} with OpenAI/OpenRouter") + return markdown_content + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + +def convert_with_openai(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using OpenAI/OpenRouter with OCR. + Processes each page as an image. + + Args: + pdf_path: Path to the PDF file. + + Returns: + Markdown content as string, or None if conversion fails. + """ + if not OPENROUTER_API_KEY: + logger.error("OpenRouter API key is not set in environment variables") + return None + + if not PDF2IMAGE_AVAILABLE: + logger.error("pdf2image is required for OpenAI conversion") + return None + + try: + logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") + + # Convert PDF to images + images = pdf_to_images(pdf_path) + if not images: + logger.error(f"Failed to convert PDF to images: {pdf_path.name}") + return None + + total_pages = len(images) + markdown_parts = [] + + # Process each page + for page_num, image in enumerate(images, 1): + try: + # Convert image to base64 + image_base64 = image_to_base64(image) + + # Convert page with OpenAI + page_markdown = convert_page_with_openai( + image_base64, page_num, total_pages + ) + + if page_markdown: + markdown_parts.append(page_markdown) + markdown_parts.append("\n\n") + else: + logger.warning(f"Failed to convert page {page_num} with OpenAI") + markdown_parts.append( + f"## Page {page_num}\n\n*[Conversion 
failed for this page]*\n\n" + ) + + except Exception as e: + logger.error( + f"Error processing page {page_num}: {str(e)}", exc_info=True + ) + markdown_parts.append( + f"## Page {page_num}\n\n*[Error processing this page: {str(e)}]*\n\n" + ) + continue + + markdown_content = "".join(markdown_parts) + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") + logger.info( + f"Extracted {len(markdown_content)} characters from {total_pages} pages" + ) + return markdown_content + else: + logger.warning( + f"OpenAI/OpenRouter conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for {pdf_path.name}: {str(e)}", + exc_info=True, + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py new file mode 100644 index 0000000..31073d1 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -0,0 +1,92 @@ +""" +PDFPlumber-based PDF to Markdown converter. +""" + +from pathlib import Path +from typing import Optional +import logging +logger = logging.getLogger(__name__) + +try: + import pdfplumber + + PDFPLUMBER_AVAILABLE = True +except ImportError: + PDFPLUMBER_AVAILABLE = False + logger.warning("PDFPlumber not available. Install with: pip install pdfplumber") + + +def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using PDFPlumber. + + Args: + pdf_path: Path to the PDF file. + + Returns: + Markdown content as string, or None if conversion fails. 
+ """ + if not PDFPLUMBER_AVAILABLE: + logger.error("PDFPlumber is not available") + return None + + try: + logger.info(f"Attempting PDFPlumber conversion for: {pdf_path.name}") + + markdown_parts = [] + + with pdfplumber.open(pdf_path) as pdf: + total_pages = len(pdf.pages) + logger.info(f"Processing {total_pages} pages with PDFPlumber") + + for page_num, page in enumerate(pdf.pages, 1): + try: + # Extract text from page + text = page.extract_text() + + if text: + markdown_parts.append(text) + markdown_parts.append("\n\n") + + # Extract tables if any + tables = page.extract_tables() + if tables: + for table in tables: + if table: + markdown_parts.append("\n### Table\n\n") + # Convert table to markdown format + for row in table: + if row: + markdown_parts.append( + "| " + + " | ".join( + str(cell) if cell else "" + for cell in row + ) + + " |\n" + ) + markdown_parts.append("\n") + + except Exception as e: + logger.warning( + f"Error processing page {page_num} of {pdf_path.name}: {str(e)}" + ) + continue + + markdown_content = "".join(markdown_parts) + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"PDFPlumber conversion successful for: {pdf_path.name}") + logger.info(f"Extracted {len(markdown_content)} characters") + return markdown_content + else: + logger.warning( + f"PDFPlumber conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py new file mode 100644 index 0000000..952124b --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -0,0 +1,100 @@ +import os +import logging +from pathlib import Path +import tempfile +from google.cloud import storage + +from converters.docling_converter import convert_with_docling +from converters.pdfplumber_converter import 
convert_with_pdfplumber +from converters.openai_converter import convert_with_openai + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +MIN_CONTENT_LENGTH = 50 + +def is_content_valid(content: str) -> bool: + if not content: + return False + content_stripped = content.strip() + if len(content_stripped) < MIN_CONTENT_LENGTH: + return False + error_patterns = [ + "traceback", "exception:", "error:", "failed to", "unable to convert", "conversion failed", "error processing" + ] + content_lower = content_stripped.lower() + first_part = content_lower[:1000] + for pattern in error_patterns: + if pattern in first_part: + if pattern.startswith("error:") or pattern.startswith("exception:"): + return False + idx = first_part.find(pattern) + if idx < 100: + return False + return True + +def convert_pdf_to_md(pdf_path: Path) -> str: + logger.info("Attempting Docling conversion...") + content = convert_with_docling(pdf_path) + if is_content_valid(content): + return content + + logger.info("Attempting PDFPlumber conversion...") + content = convert_with_pdfplumber(pdf_path) + if is_content_valid(content): + return content + + logger.info("Attempting OpenAI conversion...") + content = convert_with_openai(pdf_path) + if is_content_valid(content): + return content + + return "" + +def main(): + bucket_name = os.getenv("WG21_GCS_BUCKET") + if not bucket_name: + logger.error("WG21_GCS_BUCKET env var not set.") + return + + client = storage.Client() + bucket = client.bucket(bucket_name) + + raw_prefix = "raw/wg21_papers/" + converted_prefix = "converted/wg21_papers/" + + blobs = client.list_blobs(bucket, prefix=raw_prefix) + + with tempfile.TemporaryDirectory() as tmpdir: + for blob in blobs: + if not blob.name.lower().endswith(".pdf"): + continue + + # e.g. 
raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf + relative_path = blob.name[len(raw_prefix):] + md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" + md_blob_name = f"{converted_prefix}{md_relative_path}" + + md_blob = bucket.blob(md_blob_name) + if md_blob.exists(): + logger.info("Skipping %s, MD already exists.", blob.name) + continue + + local_pdf_path = Path(tmpdir) / "temp.pdf" + logger.info("Downloading %s to process...", blob.name) + blob.download_to_filename(str(local_pdf_path)) + + logger.info("Converting %s...", blob.name) + md_content = convert_pdf_to_md(local_pdf_path) + + if md_content: + md_blob.upload_from_string(md_content, content_type="text/markdown") + logger.info("Successfully converted and uploaded %s", md_blob_name) + else: + logger.error("Failed to convert %s", blob.name) + + if local_pdf_path.exists(): + local_pdf_path.unlink() + +if __name__ == "__main__": + main() diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt new file mode 100644 index 0000000..0a00731 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/requirements.txt @@ -0,0 +1,6 @@ +docling>=1.0.0 +pdfplumber>=0.10.0 +pdf2image>=1.16.0 +Pillow>=10.0.0 +requests>=2.31.0 +google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py new file mode 100644 index 0000000..e254223 --- /dev/null +++ b/wg21_paper_tracker/fetcher.py @@ -0,0 +1,156 @@ +""" +Fetcher for WG21 Papers. +Scrapes the WG21 papers index and specific mailing tables. +""" + +import re +import urllib.parse +from typing import Optional + +import requests +from bs4 import BeautifulSoup +from django.utils.dateparse import parse_date + +import logging +logger = logging.getLogger(__name__) + +BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" + + +def fetch_all_mailings() -> list[dict]: + """ + Fetch the main index and extract all mailings. 
+ Returns a list of dicts: + - mailing_date (e.g. '2025-02') + - title (e.g. '2025-02 pre-Hagenberg mailing') + - year (e.g. '2025') + List is in the order found on the page (usually newest first). + """ + logger.info("Fetching WG21 main index: %s/", BASE_URL) + try: + response = requests.get(f"{BASE_URL}/", timeout=30) + response.raise_for_status() + except Exception as e: + logger.error("Failed to fetch WG21 index: %s", e) + return [] + + # The mailings are listed in a markdown-like syntax or links + # Typically: 2025-02 pre-Hagenberg mailing + # Let's parse with BeautifulSoup + soup = BeautifulSoup(response.text, "html.parser") + mailings = [] + + # We look for links pointing to year/#mailingYYYY-MM + pattern = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$") + + for a in soup.find_all("a", href=True): + href = a["href"] + match = pattern.search(href) + if match: + year, mailing_date = match.groups() + title = a.text.strip() + mailings.append({ + "mailing_date": mailing_date, + "title": title, + "year": year + }) + + return mailings + + +def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: + """ + Fetch the papers for a specific mailing from the year page. + Returns a list of paper dicts. 
+ """ + url = f"{BASE_URL}/{year}/" + logger.info("Fetching mailing %s from %s", mailing_date, url) + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + except Exception as e: + logger.error("Failed to fetch year page %s: %s", year, e) + return [] + + soup = BeautifulSoup(response.text, "html.parser") + anchor_id = f"mailing{mailing_date}" + anchor = soup.find(id=anchor_id) + if not anchor: + logger.warning("Anchor %s not found on %s", anchor_id, url) + return [] + + table = anchor.find_next("table") + if not table: + logger.warning("No table found after anchor %s", anchor_id) + return [] + + paper_urls = [] + paper_pattern = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) + + for row in table.find_all("tr"): + cells = row.find_all(["td", "th"]) + if not cells or any(cell.get("colspan") for cell in cells): + continue + + # Usually: Number, Title, Author, Date, Subgroup + if len(cells) >= 1: + first_cell = cells[0] + for link in first_cell.find_all("a", href=True): + href = link.get("href", "") + match = paper_pattern.search(href) + if match: + if href.startswith("../"): + paper_url = urllib.parse.urljoin(url, href) + elif href.startswith("/"): + paper_url = urllib.parse.urljoin(BASE_URL, href) + elif not href.startswith("http"): + paper_url = urllib.parse.urljoin(url, href) + else: + paper_url = href + + paper_id = match.group(1).lower() + file_ext = match.group(2).lower() + filename = match.group(0).lower() + + title = "" + if len(cells) > 1: + title = cells[1].text.strip() + + authors = [] + if len(cells) > 2: + authors_raw = cells[2].text.strip() + # Split by comma or 'and' if multiple + if authors_raw: + authors = [a.strip() for a in re.split(r",| and ", authors_raw) if a.strip()] + + document_date = None + if len(cells) > 3: + date_str = cells[3].text.strip() + if date_str: + document_date = date_str # Will be parsed/saved in pipeline + + subgroup = "" + if len(cells) > 4: + subgroup = cells[4].text.strip() + + 
paper_urls.append({ + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + }) + break # Only take the first paper link in the cell + + # Remove exact duplicates (same filename) + seen = set() + unique_papers = [] + for p in paper_urls: + if p["filename"] not in seen: + seen.add(p["filename"]) + unique_papers.append(p) + + return unique_papers diff --git a/wg21_paper_tracker/management/__init__.py b/wg21_paper_tracker/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/management/commands/__init__.py b/wg21_paper_tracker/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py new file mode 100644 index 0000000..9e06f09 --- /dev/null +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -0,0 +1,56 @@ +""" +Management command for WG21 Paper Tracker. +Runs the pipeline to fetch new mailings, download papers, upload to GCS, and update DB. +If new papers were found and uploaded, it triggers the Google Cloud Run conversion job. +""" + +import logging +import os +from django.core.management.base import BaseCommand +from django.conf import settings + +from wg21_paper_tracker.pipeline import run_tracker_pipeline + +logger = logging.getLogger(__name__) + +def trigger_cloud_run_job(project_id: str, location: str, job_name: str): + from google.cloud import run_v2 + client = run_v2.JobsClient() + name = client.job_path(project_id, location, job_name) + request = run_v2.RunJobRequest(name=name) + logger.info("Triggering Cloud Run job %s...", name) + operation = client.run_job(request=request) + logger.info("Cloud Run job triggered. 
Operation: %s", operation.operation.name) + return operation + +class Command(BaseCommand): + help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers." + + def handle(self, *args, **options): + logger.info("Starting WG21 Paper Tracker...") + + try: + total_new_papers = run_tracker_pipeline() + self.stdout.write(self.style.SUCCESS(f"Downloaded and uploaded {total_new_papers} new papers.")) + + if total_new_papers > 0: + project_id = settings.GCP_PROJECT_ID + location = settings.GCP_LOCATION + job_name = settings.WG21_CLOUD_RUN_JOB_NAME + + if project_id and job_name: + try: + trigger_cloud_run_job(project_id, location, job_name) + self.stdout.write(self.style.SUCCESS(f"Successfully triggered Cloud Run job {job_name}.")) + except Exception as e: + logger.error("Failed to trigger Cloud Run job: %s", e) + self.stderr.write(self.style.ERROR(f"Error triggering Cloud Run job: {e}")) + else: + logger.warning("GCP_PROJECT_ID not configured. Skipping Cloud Run trigger.") + self.stdout.write(self.style.WARNING("Skipping Cloud Run trigger (missing GCP config).")) + else: + self.stdout.write("No new papers found. 
Skipping Cloud Run job.") + + except Exception as e: + logger.exception("WG21 Paper Tracker failed: %s", e) + raise diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py new file mode 100644 index 0000000..01e7e58 --- /dev/null +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -0,0 +1,69 @@ +# Generated by Django 4.2.28 on 2026-03-09 15:35 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('cppa_user_tracker', '0005_alter_slackuser_slack_user_id'), + ] + + operations = [ + migrations.CreateModel( + name='WG21Mailing', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('mailing_date', models.CharField(db_index=True, max_length=7, unique=True)), + ('title', models.CharField(max_length=255)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('updated_at', models.DateTimeField(auto_now=True)), + ], + options={ + 'verbose_name': 'WG21 Mailing', + 'verbose_name_plural': 'WG21 Mailings', + 'db_table': 'wg21_paper_tracker_wg21mailing', + 'ordering': ['-mailing_date'], + }, + ), + migrations.CreateModel( + name='WG21Paper', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('paper_id', models.CharField(db_index=True, max_length=255, unique=True)), + ('url', models.URLField(max_length=1024)), + ('title', models.CharField(db_index=True, max_length=1024)), + ('document_date', models.DateField(blank=True, db_index=True, null=True)), + ('subgroup', models.CharField(blank=True, db_index=True, max_length=255)), + ('is_downloaded', models.BooleanField(db_index=True, default=False)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('updated_at', models.DateTimeField(auto_now=True)), + ('mailing', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, 
related_name='papers', to='wg21_paper_tracker.wg21mailing')), + ], + options={ + 'verbose_name': 'WG21 Paper', + 'verbose_name_plural': 'WG21 Papers', + 'db_table': 'wg21_paper_tracker_wg21paper', + 'ordering': ['-document_date', '-paper_id'], + }, + ), + migrations.CreateModel( + name='WG21PaperAuthor', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('paper', models.ForeignKey(db_column='paper_id', on_delete=django.db.models.deletion.CASCADE, related_name='authors', to='wg21_paper_tracker.wg21paper')), + ('profile', models.ForeignKey(db_column='profile_id', on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='cppa_user_tracker.wg21paperauthorprofile')), + ], + options={ + 'verbose_name': 'WG21 Paper Author', + 'verbose_name_plural': 'WG21 Paper Authors', + 'db_table': 'wg21_paper_tracker_wg21paperauthor', + 'ordering': ['id'], + 'unique_together': {('paper', 'profile')}, + }, + ), + ] diff --git a/wg21_paper_tracker/migrations/__init__.py b/wg21_paper_tracker/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py new file mode 100644 index 0000000..9ae2d27 --- /dev/null +++ b/wg21_paper_tracker/models.py @@ -0,0 +1,76 @@ +""" +Models per docs/Schema.md section 7: WG21 Papers Tracker. +References cppa_user_tracker.WG21PaperAuthorProfile (section 1) as author. 
+""" + +from django.db import models + + +class WG21Mailing(models.Model): + """WG21 mailing release (mailing_date, title).""" + + mailing_date = models.CharField(max_length=7, unique=True, db_index=True) + title = models.CharField(max_length=255) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-mailing_date"] + verbose_name = "WG21 Mailing" + verbose_name_plural = "WG21 Mailings" + + def __str__(self): + return f"{self.mailing_date} ({self.title})" + + +class WG21Paper(models.Model): + """WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded).""" + + paper_id = models.CharField(max_length=255, unique=True, db_index=True) + url = models.URLField(max_length=1024) + title = models.CharField(max_length=1024, db_index=True) + document_date = models.DateField(db_index=True, null=True, blank=True) + mailing = models.ForeignKey( + WG21Mailing, + on_delete=models.CASCADE, + related_name="papers", + ) + subgroup = models.CharField(max_length=255, blank=True, db_index=True) + is_downloaded = models.BooleanField(default=False, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-document_date", "-paper_id"] + verbose_name = "WG21 Paper" + verbose_name_plural = "WG21 Papers" + + def __str__(self): + return f"{self.paper_id}: {self.title[:60]}" + + +class WG21PaperAuthor(models.Model): + """Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile).""" + + paper = models.ForeignKey( + WG21Paper, + on_delete=models.CASCADE, + related_name="authors", + db_column="paper_id", + ) + profile = models.ForeignKey( + "cppa_user_tracker.WG21PaperAuthorProfile", + on_delete=models.CASCADE, + related_name="papers", + db_column="profile_id", + ) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + unique_together = (("paper", "profile"),) + ordering = 
["id"] + verbose_name = "WG21 Paper Author" + verbose_name_plural = "WG21 Paper Authors" + + def __str__(self): + return f"{self.paper.paper_id} - {self.profile.display_name}" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py new file mode 100644 index 0000000..3c4146c --- /dev/null +++ b/wg21_paper_tracker/pipeline.py @@ -0,0 +1,174 @@ +""" +Pipeline for WG21 Paper Tracker. +Coordinates scraping, downloading, uploading to GCS, and updating the database. +""" + +import os +import requests +import logging +from pathlib import Path +from typing import Optional + +from django.conf import settings +from google.cloud import storage + +from wg21_paper_tracker.fetcher import fetch_all_mailings, fetch_papers_for_mailing +from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.services import get_or_create_mailing, get_or_create_paper +from wg21_paper_tracker.workspace import get_raw_dir + +logger = logging.getLogger(__name__) + +def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: str) -> bool: + """Uploads a file to the bucket.""" + try: + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(destination_blob_name) + + blob.upload_from_filename(str(source_path)) + logger.info("Uploaded %s to gs://%s/%s", source_path.name, bucket_name, destination_blob_name) + return True + except Exception as e: + logger.error("Failed to upload to GCS: %s", e) + return False + +def _download_file(url: str, filepath: Path) -> bool: + """Download file from URL to filepath.""" + try: + logger.info("Downloading %s to %s", url, filepath) + response = requests.get(url, timeout=60, stream=True) + response.raise_for_status() + + # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. 
+ content_type = response.headers.get("content-type", "") + if "text" in content_type: + with open(filepath, "w", encoding="utf-8") as f: + f.write(response.content.decode(response.apparent_encoding or "utf-8", errors="replace")) + else: + with open(filepath, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + return True + except Exception as e: + logger.error("Failed to download %s: %s", url, e) + return False + +def run_tracker_pipeline() -> int: + """ + Run the WG21 tracker pipeline. + Returns the number of new papers downloaded and uploaded. + """ + bucket_name = settings.WG21_GCS_BUCKET + if not bucket_name: + logger.warning("WG21_GCS_BUCKET not set. Will download but not upload to GCS.") + + # 1. Get latest mailing from DB + latest_mailing = WG21Mailing.objects.order_by("-mailing_date").first() + latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01" + + # 2. Fetch all mailings + all_mailings = fetch_all_mailings() + if not all_mailings: + logger.warning("No mailings found on WG21 site.") + return 0 + + # Filter newer mailings + new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] + # Also check the latest one again just in case new papers were added + if latest_mailing and latest_mailing.mailing_date not in [m["mailing_date"] for m in new_mailings]: + # We re-check the most recent mailing from the DB to catch late additions + # Find the matching dict from all_mailings + current_m = next((m for m in all_mailings if m["mailing_date"] == latest_mailing.mailing_date), None) + if current_m: + new_mailings.append(current_m) + + # Sort chronologically (oldest to newest) + new_mailings.sort(key=lambda x: x["mailing_date"]) + + total_new_papers = 0 + + for m_info in new_mailings: + mailing_date = m_info["mailing_date"] + title = m_info["title"] + year = m_info["year"] + + # Create/get mailing in DB + mailing_obj, _ = get_or_create_mailing(mailing_date, title) + + # Fetch papers for this 
mailing + papers = fetch_papers_for_mailing(year, mailing_date) + if not papers: + continue + + # Group papers by ID to prioritize PDF over HTML + papers_by_id = {} + for p in papers: + pid = p["paper_id"] + if pid not in papers_by_id: + papers_by_id[pid] = [] + papers_by_id[pid].append(p) + + def format_priority(ext: str) -> int: + priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4} + return priorities.get(ext.lower(), 100) + + raw_dir = get_raw_dir(mailing_date) + + for pid, p_list in papers_by_id.items(): + # Check DB if this paper_id is already fully downloaded + existing_paper = WG21Paper.objects.filter(paper_id=pid).first() + if existing_paper and existing_paper.is_downloaded: + continue + + # Pick the best format + p_list.sort(key=lambda x: format_priority(x["type"])) + best_paper = p_list[0] + + filename = best_paper["filename"] + local_path = raw_dir / filename + url = best_paper["url"] + + # Download + if _download_file(url, local_path): + uploaded = False + if bucket_name: + gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}" + uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path) + else: + # If no GCS, simulate success so DB is updated + uploaded = True + + # Persist DB + doc_date_str = best_paper["document_date"] + # Parse date if available + from django.utils.dateparse import parse_date + doc_date = None + if doc_date_str: + try: + doc_date = parse_date(doc_date_str) + except: + pass + + paper_obj, created = get_or_create_paper( + paper_id=pid, + url=url, + title=best_paper["title"], + document_date=doc_date, + mailing=mailing_obj, + subgroup=best_paper["subgroup"], + author_names=best_paper["authors"], + ) + + if uploaded: + paper_obj.is_downloaded = True + paper_obj.save(update_fields=["is_downloaded"]) + total_new_papers += 1 + + # Clean up local file to save space + try: + local_path.unlink() + except Exception as e: + logger.warning("Could not delete temp file %s: %s", local_path, e) + + return total_new_papers diff --git 
a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py new file mode 100644 index 0000000..3679d3f --- /dev/null +++ b/wg21_paper_tracker/services.py @@ -0,0 +1,77 @@ +""" +Database logic for WG21 Paper Tracker. +""" + +from typing import Optional + +from django.db import transaction + +from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@transaction.atomic +def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: + mailing, created = WG21Mailing.objects.get_or_create( + mailing_date=mailing_date, + defaults={"title": title} + ) + if not created and mailing.title != title: + mailing.title = title + mailing.save(update_fields=["title", "updated_at"]) + return mailing, created + + +@transaction.atomic +def get_or_create_paper( + paper_id: str, + url: str, + title: str, + document_date: Optional[str], + mailing: WG21Mailing, + subgroup: str = "", + author_names: Optional[list[str]] = None, +) -> tuple[WG21Paper, bool]: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + } + ) + if not created: + updated = False + if paper.url != url: + paper.url = url + updated = True + if paper.title != title: + paper.title = title + updated = True + if paper.document_date != document_date: + paper.document_date = document_date + updated = True + if paper.mailing_id != mailing.id: + paper.mailing = mailing + updated = True + if paper.subgroup != subgroup: + paper.subgroup = subgroup + updated = True + if updated: + paper.save() + + if author_names: + for name in author_names: + profile, _ = get_or_create_wg21_paper_author_profile(name) + WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + ) + + return paper, created + + +def mark_paper_downloaded(paper_id: str): + 
WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True) diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py new file mode 100644 index 0000000..19c0d1b --- /dev/null +++ b/wg21_paper_tracker/workspace.py @@ -0,0 +1,23 @@ +""" +Workspace paths for wg21_paper_tracker. +Temporary file storage during download before uploading to GCS. +""" + +from pathlib import Path + +from config.workspace import get_workspace_path + +_APP_SLUG = "wg21_paper_tracker" +_RAW_APP_SLUG = f"raw/{_APP_SLUG}" + + +def get_workspace_root() -> Path: + return get_workspace_path(_APP_SLUG) + + +def get_raw_dir(mailing_date: str) -> Path: + """Return workspace/raw/wg21_paper_tracker//; creates if missing.""" + raw_root = get_workspace_path(_RAW_APP_SLUG) + path = raw_root / mailing_date + path.mkdir(parents=True, exist_ok=True) + return path diff --git a/workflow/management/commands/run_all_collectors.py b/workflow/management/commands/run_all_collectors.py index 25b262b..c2c1fe4 100644 --- a/workflow/management/commands/run_all_collectors.py +++ b/workflow/management/commands/run_all_collectors.py @@ -20,6 +20,7 @@ "run_boost_usage_tracker", "run_boost_mailing_list_tracker", "run_discord_exporter", + "run_wg21_paper_tracker", ] From 9892a4574adabad90a532853cd7a2bbe08ba0bea Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 19:18:09 -0700 Subject: [PATCH 02/76] wg21_paper_tracker: features, tests, and cleanup #24 --- config/settings.py | 4 +- cppa_user_tracker/services.py | 12 +- parse_test.py | 8 - parse_year.py | 20 -- wg21_paper_tracker/admin.py | 12 +- .../converters/docling_converter.py | 1 + .../converters/openai_converter.py | 5 +- .../converters/pdfplumber_converter.py | 1 + wg21_paper_tracker/cloud_run_job/main.py | 26 +- wg21_paper_tracker/fetcher.py | 49 ++-- .../commands/import_wg21_metadata_from_csv.py | 249 ++++++++++++++++++ .../commands/run_wg21_paper_tracker.py | 53 +++- wg21_paper_tracker/migrations/0001_initial.py | 140 
+++++++--- wg21_paper_tracker/models.py | 8 +- wg21_paper_tracker/pipeline.py | 168 +++++++++--- wg21_paper_tracker/services.py | 17 +- wg21_paper_tracker/tests/__init__.py | 1 + wg21_paper_tracker/tests/test_fetcher.py | 179 +++++++++++++ wg21_paper_tracker/tests/test_models.py | 76 ++++++ wg21_paper_tracker/tests/test_pipeline.py | 149 +++++++++++ wg21_paper_tracker/tests/test_services.py | 223 ++++++++++++++++ wg21_paper_tracker/tests/test_workspace.py | 73 +++++ 22 files changed, 1310 insertions(+), 164 deletions(-) delete mode 100644 parse_test.py delete mode 100644 parse_year.py create mode 100644 wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py create mode 100644 wg21_paper_tracker/tests/__init__.py create mode 100644 wg21_paper_tracker/tests/test_fetcher.py create mode 100644 wg21_paper_tracker/tests/test_models.py create mode 100644 wg21_paper_tracker/tests/test_pipeline.py create mode 100644 wg21_paper_tracker/tests/test_services.py create mode 100644 wg21_paper_tracker/tests/test_workspace.py diff --git a/config/settings.py b/config/settings.py index ae4a50f..925ebe5 100644 --- a/config/settings.py +++ b/config/settings.py @@ -220,7 +220,9 @@ WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "").strip() +WG21_CLOUD_RUN_JOB_NAME = ( + env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "" +).strip() # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 35503f4..35b4e31 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -50,9 +50,7 @@ def get_or_create_identity( """Get or create an 
Identity by display_name. If exists, updates description from defaults.""" lookup = {"display_name": display_name} defaults = defaults or {"description": description} - identity, created = Identity.objects.get_or_create( - defaults=defaults, **lookup - ) + identity, created = Identity.objects.get_or_create(defaults=defaults, **lookup) if ( not created and "description" in defaults @@ -265,9 +263,7 @@ def get_or_create_slack_user( raise ValueError("Slack user ID ('id') is required") profile = user_data.get("profile") or {} username = (user_data.get("name") or "").strip() - display_name = ( - user_data.get("real_name") or user_data.get("name") or "" - ).strip() + display_name = (user_data.get("real_name") or user_data.get("name") or "").strip() avatar_url = (profile.get("image_72") or "").strip() user, created = SlackUser.objects.get_or_create( slack_user_id=user_id, @@ -310,9 +306,7 @@ def get_or_create_unknown_github_account( ).first() if existing is not None: if email_str and not existing.emails.filter(email=email_str).exists(): - add_email( - existing, email_str, is_primary=not existing.emails.exists() - ) + add_email(existing, email_str, is_primary=not existing.emails.exists()) return existing, False next_id = _get_next_negative_github_account_id() account = get_or_create_github_account( diff --git a/parse_test.py b/parse_test.py deleted file mode 100644 index 19dd034..0000000 --- a/parse_test.py +++ /dev/null @@ -1,8 +0,0 @@ -import re -text = """- [2026-01 mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-01) -- [2026-02 pre-Croydon mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-02) -- [2026](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/) N5034-N????""" - -pattern = re.compile(r'\[([^\]]+)\]\([^#]+#mailing(\d{4}-\d{2})\)') -for m in pattern.finditer(text): - print(m.groups()) diff --git a/parse_year.py b/parse_year.py deleted file mode 100644 index 1f6277f..0000000 --- a/parse_year.py +++ 
/dev/null @@ -1,20 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -response = requests.get('https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/') -soup = BeautifulSoup(response.text, 'html.parser') - -anchor = soup.find('a', id='mailing2026-02') -if not anchor: - print("Anchor not found") -else: - table = anchor.find_next('table') - rows = table.find_all('tr') - print(f"Found {len(rows)} rows in table after anchor") - for row in rows[:3]: - cells = [c.text.strip() for c in row.find_all(['th', 'td'])] - print(cells) - # Also print links in first cell - if row.find('td'): - links = row.find_all('td')[0].find_all('a') - print("Links:", [l['href'] for l in links]) diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py index a22358d..86784ae 100644 --- a/wg21_paper_tracker/admin.py +++ b/wg21_paper_tracker/admin.py @@ -17,9 +17,17 @@ class WG21PaperAuthorInline(admin.TabularInline): @admin.register(WG21Paper) class WG21PaperAdmin(admin.ModelAdmin): - list_display = ("paper_id", "title", "document_date", "mailing", "subgroup", "is_downloaded") + list_display = ( + "paper_id", + "year", + "title", + "document_date", + "mailing", + "subgroup", + "is_downloaded", + ) search_fields = ("paper_id", "title", "url", "subgroup") - list_filter = ("is_downloaded", "subgroup", "mailing") + list_filter = ("is_downloaded", "subgroup", "mailing", "year") ordering = ("-document_date", "-paper_id") inlines = [WG21PaperAuthorInline] diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py index b113332..b9d6067 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional import logging + logger = logging.getLogger(__name__) try: diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py 
b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 211ade7..cd168aa 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -2,15 +2,18 @@ OpenAI/OpenRouter-based PDF to Markdown converter with OCR. """ +import os import base64 from pathlib import Path from typing import Optional import requests import logging + logger = logging.getLogger(__name__) # Base configuration fallback -import os + + OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 31073d1..58a1465 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional import logging + logger = logging.getLogger(__name__) try: diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index 952124b..cf704ae 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -8,11 +8,14 @@ from converters.pdfplumber_converter import convert_with_pdfplumber from converters.openai_converter import convert_with_openai -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) MIN_CONTENT_LENGTH = 50 + def is_content_valid(content: str) -> bool: if not content: return False @@ -20,7 +23,13 @@ def is_content_valid(content: str) -> bool: if len(content_stripped) < MIN_CONTENT_LENGTH: return False error_patterns = 
[ - "traceback", "exception:", "error:", "failed to", "unable to convert", "conversion failed", "error processing" + "traceback", + "exception:", + "error:", + "failed to", + "unable to convert", + "conversion failed", + "error processing", ] content_lower = content_stripped.lower() first_part = content_lower[:1000] @@ -33,6 +42,7 @@ def is_content_valid(content: str) -> bool: return False return True + def convert_pdf_to_md(pdf_path: Path) -> str: logger.info("Attempting Docling conversion...") content = convert_with_docling(pdf_path) @@ -48,9 +58,10 @@ def convert_pdf_to_md(pdf_path: Path) -> str: content = convert_with_openai(pdf_path) if is_content_valid(content): return content - + return "" + def main(): bucket_name = os.getenv("WG21_GCS_BUCKET") if not bucket_name: @@ -64,14 +75,14 @@ def main(): converted_prefix = "converted/wg21_papers/" blobs = client.list_blobs(bucket, prefix=raw_prefix) - + with tempfile.TemporaryDirectory() as tmpdir: for blob in blobs: if not blob.name.lower().endswith(".pdf"): continue - + # e.g. 
raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix):] + relative_path = blob.name[len(raw_prefix) :] md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" md_blob_name = f"{converted_prefix}{md_relative_path}" @@ -92,9 +103,10 @@ def main(): logger.info("Successfully converted and uploaded %s", md_blob_name) else: logger.error("Failed to convert %s", blob.name) - + if local_pdf_path.exists(): local_pdf_path.unlink() + if __name__ == "__main__": main() diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index e254223..e733e83 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -5,13 +5,12 @@ import re import urllib.parse -from typing import Optional import requests from bs4 import BeautifulSoup -from django.utils.dateparse import parse_date import logging + logger = logging.getLogger(__name__) BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" @@ -39,22 +38,20 @@ def fetch_all_mailings() -> list[dict]: # Let's parse with BeautifulSoup soup = BeautifulSoup(response.text, "html.parser") mailings = [] - + # We look for links pointing to year/#mailingYYYY-MM pattern = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$") - + for a in soup.find_all("a", href=True): href = a["href"] match = pattern.search(href) if match: year, mailing_date = match.groups() title = a.text.strip() - mailings.append({ - "mailing_date": mailing_date, - "title": title, - "year": year - }) - + mailings.append( + {"mailing_date": mailing_date, "title": title, "year": year} + ) + return mailings @@ -74,7 +71,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: soup = BeautifulSoup(response.text, "html.parser") anchor_id = f"mailing{mailing_date}" - anchor = soup.find(id=anchor_id) + anchor = soup.find(id=anchor_id) or soup.find(attrs={"name": anchor_id}) if not anchor: logger.warning("Anchor %s not found on %s", anchor_id, url) return [] @@ -115,13 +112,17 
@@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: title = "" if len(cells) > 1: title = cells[1].text.strip() - + authors = [] if len(cells) > 2: authors_raw = cells[2].text.strip() # Split by comma or 'and' if multiple if authors_raw: - authors = [a.strip() for a in re.split(r",| and ", authors_raw) if a.strip()] + authors = [ + a.strip() + for a in re.split(r",| and ", authors_raw) + if a.strip() + ] document_date = None if len(cells) > 3: @@ -133,16 +134,18 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: if len(cells) > 4: subgroup = cells[4].text.strip() - paper_urls.append({ - "url": paper_url, - "filename": filename, - "type": file_ext, - "paper_id": paper_id, - "title": title, - "authors": authors, - "document_date": document_date, - "subgroup": subgroup, - }) + paper_urls.append( + { + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + } + ) break # Only take the first paper link in the cell # Remove exact duplicates (same filename) diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py new file mode 100644 index 0000000..5d4a398 --- /dev/null +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -0,0 +1,249 @@ +""" +Management command: import_wg21_metadata_from_csv + +Reads workspace/wg21_paper_tracker/metadata.csv (or a given path) and fills +WG21Mailing, WG21Paper, and WG21PaperAuthor using get_or_create_mailing and +get_or_create_paper. Handles missing mailing_date via a placeholder mailing +(unknown / Unknown). 
+""" + +import csv +import logging +import re +from pathlib import Path + +from django.core.management.base import BaseCommand +from django.db import IntegrityError +from django.utils.dateparse import parse_date + +from wg21_paper_tracker.models import WG21Paper, WG21PaperAuthor +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, +) +from wg21_paper_tracker.workspace import get_workspace_root + +logger = logging.getLogger(__name__) + +MAILING_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}$") +TITLE_MAX_LENGTH = 1024 +PLACEHOLDER_MAILING_DATE = "unknown" +PLACEHOLDER_MAILING_TITLE = "Unknown" + + +def _norm(s: str) -> str: + """Return the string stripped of leading/trailing whitespace, or empty string if None.""" + return (s or "").strip() + + +def _normalize_title(raw: str) -> str: + """Replace internal newlines with space and truncate to model max_length.""" + if not raw: + return "" + one_line = " ".join(raw.split()) + return ( + one_line[:TITLE_MAX_LENGTH] + if len(one_line) > TITLE_MAX_LENGTH + else one_line + ) + + +def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]: + """ + Return (mailing_date, title) for this row. + If CSV mailing_date is non-empty and YYYY-MM, use it with synthetic title. + Otherwise use placeholder mailing_date="unknown", title="Unknown". + """ + cleaned = _norm(csv_mailing_date) + if cleaned and MAILING_DATE_PATTERN.match(cleaned): + return cleaned, f"{cleaned} (from metadata)" + return PLACEHOLDER_MAILING_DATE, PLACEHOLDER_MAILING_TITLE + + +def _parse_document_date(date_str: str): + """Return date or None from CSV date column (e.g. YYYY-MM-DD). 
Invalid values return None.""" + cleaned = _norm(date_str) + if not cleaned: + return None + # try: + return parse_date(cleaned) + # except (ValueError, TypeError): + # return None + + +def _author_names_from_csv(author_str: str) -> list[str]: + """Split author column by comma, strip each, drop empty.""" + cleaned = _norm(author_str) + if not cleaned: + return [] + return [a.strip() for a in cleaned.split(",") if a.strip()] + + +def _read_csv_rows(csv_path: Path): + """Yield dicts for each row with normalized keys and values.""" + with open(csv_path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + out = {} + for k, v in row.items(): + if k is None: + continue + key = k.strip().lower() + out[key] = _norm(v) if v is not None else "" + # Normalize title (multi-line -> single line, truncate) + if "title" in out: + out["title"] = _normalize_title(out["title"]) + yield out + + +class Command(BaseCommand): + help = ( + "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). " + "CSV columns: filename, paper_id, url, title, author, date, mailing_date, subgroup. " + "When mailing_date is empty, papers are linked to a single 'unknown' mailing." 
+ ) + + def add_arguments(self, parser): + parser.add_argument( + "--csv-file", + type=Path, + default=None, + help="Path to metadata CSV (default: workspace/wg21_paper_tracker/metadata.csv)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Only read CSV and report what would be done; do not write to DB.", + ) + + def handle(self, *args, **options): + csv_path = options.get("csv_file") or ( + get_workspace_root() / "metadata.csv" + ) + dry_run = options["dry_run"] + + if not csv_path.exists(): + logger.error("File not found: %s", csv_path) + return + + if dry_run: + logger.info("Dry run: no DB writes.") + + stats = { + "rows": 0, + "skipped": 0, + "mailings_created": 0, + "papers_created": 0, + "papers_updated": 0, + } + + for row in _read_csv_rows(csv_path): + stats["rows"] += 1 + paper_id = (row.get("paper_id", "") or "").strip().lower() + url = row.get("url", "") + document_date = row.get("date", "") + + if not paper_id or not url: + stats["skipped"] += 1 + if stats["skipped"] <= 5: + logger.debug( + "Skipping row: missing paper_id or url: %s", + row.get("paper_id", "") or row.get("url", "")[:50], + ) + continue + + mailing_date, mailing_title = _resolve_mailing_date( + row.get("mailing_date", "") + ) + year_str = ( + mailing_date[:4] + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date) + else (document_date[:4] if document_date else None) + ) + year = int(year_str) if year_str and year_str.isdigit() else None + try: + document_date = _parse_document_date(row.get("date", "")) + title = row.get("title", "") or paper_id + subgroup = row.get("subgroup", "") + author_names = _author_names_from_csv(row.get("author", "")) + except Exception as e: + stats["skipped"] += 1 + logger.error( + "Error parsing document date for paper_id=%s: %s", + paper_id, + e, + ) + continue + + if dry_run: + logger.info( + "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", + paper_id, + mailing_date, + document_date, + 
len(author_names), + ) + continue + + try: + mailing, mailing_created = get_or_create_mailing( + mailing_date, mailing_title + ) + if mailing_created: + stats["mailings_created"] += 1 + + paper, paper_created = get_or_create_paper( + paper_id=paper_id, + url=url, + title=title, + document_date=document_date, + mailing=mailing, + subgroup=subgroup, + author_names=author_names if author_names else None, + year=year, + ) + if paper_created: + stats["papers_created"] += 1 + else: + stats["papers_updated"] += 1 + except IntegrityError as e: + # Duplicate (paper_id) or (paper_id, year): fetch existing and update + stats["papers_updated"] += 1 + try: + paper = WG21Paper.objects.filter(paper_id=paper_id).first() + if paper is None: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", paper_id, e) + else: + paper.url = url + paper.title = title + paper.document_date = document_date + paper.mailing = mailing + paper.subgroup = subgroup + if year is not None: + paper.year = year + paper.save() + if author_names: + from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + for name in author_names: + profile, _ = get_or_create_wg21_paper_author_profile(name) + WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + ) + except Exception as inner: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s (after IntegrityError): %s", paper_id, inner) + except Exception as e: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", paper_id, e) + + logger.info( + "Rows processed: %d, skipped: %d, mailings created: %d, papers created: %d, papers updated: %d", + stats["rows"], + stats["skipped"], + stats["mailings_created"], + stats["papers_created"], + stats["papers_updated"], + ) + logger.info("Done.") diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index 9e06f09..f771043 100644 --- 
a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -5,7 +5,6 @@ """ import logging -import os from django.core.management.base import BaseCommand from django.conf import settings @@ -13,8 +12,17 @@ logger = logging.getLogger(__name__) + def trigger_cloud_run_job(project_id: str, location: str, job_name: str): + """ + Start the named Cloud Run job (run once, no polling). + + Uses the Cloud Run v2 API to trigger the job identified by project_id, + location, and job_name. The job runs asynchronously; this function returns + the operation and does not wait for the job to finish. + """ from google.cloud import run_v2 + client = run_v2.JobsClient() name = client.job_path(project_id, location, job_name) request = run_v2.RunJobRequest(name=name) @@ -23,16 +31,39 @@ def trigger_cloud_run_job(project_id: str, location: str, job_name: str): logger.info("Cloud Run job triggered. Operation: %s", operation.operation.name) return operation + class Command(BaseCommand): + """Run WG21 paper tracker and optionally trigger the Cloud Run conversion job.""" + help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers." + def add_arguments(self, parser): + """Register --dry-run so the command can skip pipeline and Cloud Run.""" + parser.add_argument( + "--dry-run", + action="store_true", + help="Only log what would be done; do not run the pipeline or trigger Cloud Run.", + ) + def handle(self, *args, **options): + """ + Run the tracker pipeline; if new papers were uploaded, trigger the Cloud Run job. + + With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. + Otherwise runs the pipeline, then triggers the configured Cloud Run job when + total_new_papers > 0 and GCP_PROJECT_ID and WG21_CLOUD_RUN_JOB_NAME are set. 
+ """ + dry_run = options.get("dry_run", False) + if dry_run: + logger.info("Dry run: skipping pipeline and Cloud Run trigger.") + return + logger.info("Starting WG21 Paper Tracker...") - + try: total_new_papers = run_tracker_pipeline() - self.stdout.write(self.style.SUCCESS(f"Downloaded and uploaded {total_new_papers} new papers.")) - + logger.info("Downloaded and uploaded %d new papers.", total_new_papers) + if total_new_papers > 0: project_id = settings.GCP_PROJECT_ID location = settings.GCP_LOCATION @@ -41,16 +72,18 @@ def handle(self, *args, **options): if project_id and job_name: try: trigger_cloud_run_job(project_id, location, job_name) - self.stdout.write(self.style.SUCCESS(f"Successfully triggered Cloud Run job {job_name}.")) + logger.info( + "Successfully triggered Cloud Run job %s.", job_name + ) except Exception as e: logger.error("Failed to trigger Cloud Run job: %s", e) - self.stderr.write(self.style.ERROR(f"Error triggering Cloud Run job: {e}")) else: - logger.warning("GCP_PROJECT_ID not configured. Skipping Cloud Run trigger.") - self.stdout.write(self.style.WARNING("Skipping Cloud Run trigger (missing GCP config).")) + logger.warning( + "GCP_PROJECT_ID not configured. Skipping Cloud Run trigger." + ) else: - self.stdout.write("No new papers found. Skipping Cloud Run job.") - + logger.info("No new papers found. 
Skipping Cloud Run job.") + except Exception as e: logger.exception("WG21 Paper Tracker failed: %s", e) raise diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index 01e7e58..b4f9635 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.28 on 2026-03-09 15:35 +# Merged initial migration: WG21 Mailing, WG21 Paper (with year), WG21 Paper Author from django.db import migrations, models import django.db.models.deletion @@ -9,61 +9,127 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ('cppa_user_tracker', '0005_alter_slackuser_slack_user_id'), + ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), ] operations = [ migrations.CreateModel( - name='WG21Mailing', + name="WG21Mailing", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('mailing_date', models.CharField(db_index=True, max_length=7, unique=True)), - ('title', models.CharField(max_length=255)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "mailing_date", + models.CharField(db_index=True, max_length=7, unique=True), + ), + ("title", models.CharField(max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), ], options={ - 'verbose_name': 'WG21 Mailing', - 'verbose_name_plural': 'WG21 Mailings', - 'db_table': 'wg21_paper_tracker_wg21mailing', - 'ordering': ['-mailing_date'], + "verbose_name": "WG21 Mailing", + "verbose_name_plural": "WG21 Mailings", + "db_table": "wg21_paper_tracker_wg21mailing", + "ordering": ["-mailing_date"], }, ), migrations.CreateModel( - name='WG21Paper', + 
name="WG21Paper", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('paper_id', models.CharField(db_index=True, max_length=255, unique=True)), - ('url', models.URLField(max_length=1024)), - ('title', models.CharField(db_index=True, max_length=1024)), - ('document_date', models.DateField(blank=True, db_index=True, null=True)), - ('subgroup', models.CharField(blank=True, db_index=True, max_length=255)), - ('is_downloaded', models.BooleanField(db_index=True, default=False)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ('mailing', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='wg21_paper_tracker.wg21mailing')), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("paper_id", models.CharField(db_index=True, max_length=255)), + ("url", models.URLField(max_length=1024)), + ("title", models.CharField(db_index=True, max_length=1024)), + ( + "document_date", + models.DateField(blank=True, db_index=True, null=True), + ), + ( + "year", + models.IntegerField(blank=True, db_index=True, null=True), + ), + ( + "subgroup", + models.CharField( + blank=True, db_index=True, max_length=255 + ), + ), + ( + "is_downloaded", + models.BooleanField(db_index=True, default=False), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "mailing", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="wg21_paper_tracker.wg21mailing", + ), + ), ], options={ - 'verbose_name': 'WG21 Paper', - 'verbose_name_plural': 'WG21 Papers', - 'db_table': 'wg21_paper_tracker_wg21paper', - 'ordering': ['-document_date', '-paper_id'], + "verbose_name": "WG21 Paper", + "verbose_name_plural": "WG21 Papers", + "db_table": 
"wg21_paper_tracker_wg21paper", + "ordering": ["-document_date", "-paper_id", "-year"], + "unique_together": {("paper_id", "year")}, }, ), migrations.CreateModel( - name='WG21PaperAuthor', + name="WG21PaperAuthor", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('paper', models.ForeignKey(db_column='paper_id', on_delete=django.db.models.deletion.CASCADE, related_name='authors', to='wg21_paper_tracker.wg21paper')), - ('profile', models.ForeignKey(db_column='profile_id', on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='cppa_user_tracker.wg21paperauthorprofile')), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "paper", + models.ForeignKey( + db_column="paper_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="authors", + to="wg21_paper_tracker.wg21paper", + ), + ), + ( + "profile", + models.ForeignKey( + db_column="profile_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="cppa_user_tracker.wg21paperauthorprofile", + ), + ), ], options={ - 'verbose_name': 'WG21 Paper Author', - 'verbose_name_plural': 'WG21 Paper Authors', - 'db_table': 'wg21_paper_tracker_wg21paperauthor', - 'ordering': ['id'], - 'unique_together': {('paper', 'profile')}, + "verbose_name": "WG21 Paper Author", + "verbose_name_plural": "WG21 Paper Authors", + "db_table": "wg21_paper_tracker_wg21paperauthor", + "ordering": ["id"], + "unique_together": {("paper", "profile")}, }, ), ] diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py index 9ae2d27..44754b4 100644 --- a/wg21_paper_tracker/models.py +++ b/wg21_paper_tracker/models.py @@ -24,12 +24,13 @@ def __str__(self): class WG21Paper(models.Model): - """WG21 paper (paper_id, url, title, 
document_date, mailing, subgroup, is_downloaded).""" + """WG21 paper (paper_id, url, title, document_date, year, mailing, subgroup, is_downloaded).""" - paper_id = models.CharField(max_length=255, unique=True, db_index=True) + paper_id = models.CharField(max_length=255, db_index=True) url = models.URLField(max_length=1024) title = models.CharField(max_length=1024, db_index=True) document_date = models.DateField(db_index=True, null=True, blank=True) + year = models.IntegerField(null=True, blank=True, db_index=True) mailing = models.ForeignKey( WG21Mailing, on_delete=models.CASCADE, @@ -41,7 +42,8 @@ class WG21Paper(models.Model): updated_at = models.DateTimeField(auto_now=True) class Meta: - ordering = ["-document_date", "-paper_id"] + unique_together = (("paper_id", "year"),) + ordering = ["-document_date", "-paper_id", "-year"] verbose_name = "WG21 Paper" verbose_name_plural = "WG21 Papers" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 3c4146c..edcf003 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -3,23 +3,35 @@ Coordinates scraping, downloading, uploading to GCS, and updating the database. 
""" -import os +import time import requests import logging from pathlib import Path -from typing import Optional from django.conf import settings from google.cloud import storage -from wg21_paper_tracker.fetcher import fetch_all_mailings, fetch_papers_for_mailing +from wg21_paper_tracker.fetcher import ( + fetch_all_mailings, + fetch_papers_for_mailing, +) from wg21_paper_tracker.models import WG21Mailing, WG21Paper -from wg21_paper_tracker.services import get_or_create_mailing, get_or_create_paper +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, +) from wg21_paper_tracker.workspace import get_raw_dir logger = logging.getLogger(__name__) -def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: str) -> bool: +DOWNLOAD_TIMEOUT = 30 +DOWNLOAD_MAX_RETRIES = 3 +DOWNLOAD_RETRY_DELAY = 2 + + +def _upload_to_gcs( + bucket_name: str, source_path: Path, destination_blob_name: str +) -> bool: """Uploads a file to the bucket.""" try: storage_client = storage.Client() @@ -27,32 +39,67 @@ def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: s blob = bucket.blob(destination_blob_name) blob.upload_from_filename(str(source_path)) - logger.info("Uploaded %s to gs://%s/%s", source_path.name, bucket_name, destination_blob_name) + logger.info( + "Uploaded %s to gs://%s/%s", + source_path.name, + bucket_name, + destination_blob_name, + ) return True except Exception as e: logger.error("Failed to upload to GCS: %s", e) return False + def _download_file(url: str, filepath: Path) -> bool: - """Download file from URL to filepath.""" - try: - logger.info("Downloading %s to %s", url, filepath) - response = requests.get(url, timeout=60, stream=True) - response.raise_for_status() - - # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. 
- content_type = response.headers.get("content-type", "") - if "text" in content_type: - with open(filepath, "w", encoding="utf-8") as f: - f.write(response.content.decode(response.apparent_encoding or "utf-8", errors="replace")) - else: - with open(filepath, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - return True - except Exception as e: - logger.error("Failed to download %s: %s", url, e) - return False + """Download file from URL to filepath with retries and 30s timeout.""" + for attempt in range(1, DOWNLOAD_MAX_RETRIES + 1): + try: + logger.info( + "Downloading %s to %s (attempt %d/%d)", + url, + filepath, + attempt, + DOWNLOAD_MAX_RETRIES, + ) + response = requests.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) + response.raise_for_status() + + # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. + content_type = response.headers.get("content-type", "") + if "text" in content_type: + with open(filepath, "w", encoding="utf-8") as f: + f.write( + response.content.decode( + response.apparent_encoding or "utf-8", + errors="replace", + ) + ) + else: + with open(filepath, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + return True + except Exception as e: + if attempt < DOWNLOAD_MAX_RETRIES: + logger.warning( + "Download attempt %d/%d failed for %s: %s. Retrying in %ds.", + attempt, + DOWNLOAD_MAX_RETRIES, + url, + e, + DOWNLOAD_RETRY_DELAY, + ) + time.sleep(DOWNLOAD_RETRY_DELAY) + else: + logger.error( + "Failed to download %s after %d attempts: %s", + url, + DOWNLOAD_MAX_RETRIES, + e, + ) + return False + def run_tracker_pipeline() -> int: """ @@ -64,7 +111,11 @@ def run_tracker_pipeline() -> int: logger.warning("WG21_GCS_BUCKET not set. Will download but not upload to GCS.") # 1. 
Get latest mailing from DB - latest_mailing = WG21Mailing.objects.order_by("-mailing_date").first() + latest_mailing = ( + WG21Mailing.objects.exclude(mailing_date="unknown") + .order_by("-mailing_date") + .first() + ) latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01" # 2. Fetch all mailings @@ -76,22 +127,37 @@ def run_tracker_pipeline() -> int: # Filter newer mailings new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] # Also check the latest one again just in case new papers were added - if latest_mailing and latest_mailing.mailing_date not in [m["mailing_date"] for m in new_mailings]: + if latest_mailing and latest_mailing.mailing_date not in [ + m["mailing_date"] for m in new_mailings + ]: # We re-check the most recent mailing from the DB to catch late additions # Find the matching dict from all_mailings - current_m = next((m for m in all_mailings if m["mailing_date"] == latest_mailing.mailing_date), None) + current_m = next( + ( + m + for m in all_mailings + if m["mailing_date"] == latest_mailing.mailing_date + ), + None, + ) if current_m: new_mailings.append(current_m) # Sort chronologically (oldest to newest) new_mailings.sort(key=lambda x: x["mailing_date"]) + logger.info( + "Pipeline: latest_date=%s, all_mailings=%d, mailings_to_process=%s", + latest_date, + len(all_mailings), + [m["mailing_date"] for m in new_mailings], + ) total_new_papers = 0 for m_info in new_mailings: mailing_date = m_info["mailing_date"] title = m_info["title"] - year = m_info["year"] + year = int(m_info["year"]) if m_info["year"] else None # Create/get mailing in DB mailing_obj, _ = get_or_create_mailing(mailing_date, title) @@ -99,12 +165,16 @@ def run_tracker_pipeline() -> int: # Fetch papers for this mailing papers = fetch_papers_for_mailing(year, mailing_date) if not papers: + logger.info( + "Mailing %s: no papers found (anchor/table may be missing).", + mailing_date, + ) continue - # Group papers by ID to prioritize PDF over 
HTML + # Group papers by ID to prioritize PDF over HTML (paper_id is case-insensitive) papers_by_id = {} for p in papers: - pid = p["paper_id"] + pid = (p["paper_id"] or "").strip().lower() if pid not in papers_by_id: papers_by_id[pid] = [] papers_by_id[pid].append(p) @@ -115,10 +185,12 @@ def format_priority(ext: str) -> int: raw_dir = get_raw_dir(mailing_date) + skipped_downloaded = 0 for pid, p_list in papers_by_id.items(): # Check DB if this paper_id is already fully downloaded existing_paper = WG21Paper.objects.filter(paper_id=pid).first() if existing_paper and existing_paper.is_downloaded: + skipped_downloaded += 1 continue # Pick the best format @@ -138,19 +210,25 @@ def format_priority(ext: str) -> int: else: # If no GCS, simulate success so DB is updated uploaded = True - + # Persist DB doc_date_str = best_paper["document_date"] # Parse date if available from django.utils.dateparse import parse_date + doc_date = None if doc_date_str: try: doc_date = parse_date(doc_date_str) - except: - pass + except Exception as e: + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + doc_date = None - paper_obj, created = get_or_create_paper( + paper_obj, _created = get_or_create_paper( paper_id=pid, url=url, title=best_paper["title"], @@ -158,17 +236,27 @@ def format_priority(ext: str) -> int: mailing=mailing_obj, subgroup=best_paper["subgroup"], author_names=best_paper["authors"], + year=year, ) - + if uploaded: paper_obj.is_downloaded = True paper_obj.save(update_fields=["is_downloaded"]) total_new_papers += 1 # Clean up local file to save space - try: - local_path.unlink() - except Exception as e: - logger.warning("Could not delete temp file %s: %s", local_path, e) + # try: + # # local_path.unlink() + # except Exception as e: + # logger.warning( + # "Could not delete temp file %s: %s", local_path, e + # ) + + if skipped_downloaded: + logger.info( + "Mailing %s: skipped %d papers (already downloaded).", + mailing_date, + 
skipped_downloaded, + ) return total_new_papers diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 3679d3f..cf846b0 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -13,8 +13,7 @@ @transaction.atomic def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: mailing, created = WG21Mailing.objects.get_or_create( - mailing_date=mailing_date, - defaults={"title": title} + mailing_date=mailing_date, defaults={"title": title} ) if not created and mailing.title != title: mailing.title = title @@ -31,16 +30,24 @@ def get_or_create_paper( mailing: WG21Mailing, subgroup: str = "", author_names: Optional[list[str]] = None, + year: int | None = None, ) -> tuple[WG21Paper, bool]: + paper_id = (paper_id or "").strip().lower() + year_val = None + if year: + s = (year if isinstance(year, str) else str(year)).strip()[:4] + if s.isdigit(): + year_val = int(s) paper, created = WG21Paper.objects.get_or_create( paper_id=paper_id, + year=year_val, defaults={ "url": url, "title": title, "document_date": document_date, "mailing": mailing, "subgroup": subgroup, - } + }, ) if not created: updated = False @@ -59,6 +66,9 @@ def get_or_create_paper( if paper.subgroup != subgroup: paper.subgroup = subgroup updated = True + if year_val is not None and paper.year != year_val: + paper.year = year_val + updated = True if updated: paper.save() @@ -74,4 +84,5 @@ def get_or_create_paper( def mark_paper_downloaded(paper_id: str): + paper_id = (paper_id or "").strip().lower() WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True) diff --git a/wg21_paper_tracker/tests/__init__.py b/wg21_paper_tracker/tests/__init__.py new file mode 100644 index 0000000..18e481d --- /dev/null +++ b/wg21_paper_tracker/tests/__init__.py @@ -0,0 +1 @@ +# Tests for wg21_paper_tracker app (excluding cloud_run_job). 
diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py new file mode 100644 index 0000000..a06317a --- /dev/null +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -0,0 +1,179 @@ +"""Tests for wg21_paper_tracker.fetcher.""" + +from unittest.mock import patch, MagicMock + +import pytest + +from wg21_paper_tracker.fetcher import ( + BASE_URL, + fetch_all_mailings, + fetch_papers_for_mailing, +) + + +# --- fetch_all_mailings --- + + +def test_fetch_all_mailings_returns_empty_on_request_failure(): + """fetch_all_mailings returns [] when requests.get raises.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.side_effect = Exception("network error") + result = fetch_all_mailings() + assert result == [] + + +def test_fetch_all_mailings_returns_empty_on_http_error(): + """fetch_all_mailings returns [] when response.raise_for_status raises.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.raise_for_status.side_effect = Exception("404") + m.return_value = resp + result = fetch_all_mailings() + assert result == [] + + +def test_fetch_all_mailings_parses_links(): + """fetch_all_mailings parses year/#mailingYYYY-MM links and returns mailings.""" + html = """ + + 2025-01 pre-meeting mailing + 2025-02 post-meeting mailing + 2024-11 mailing + Ignore + + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_all_mailings() + assert len(result) == 3 + assert result[0]["mailing_date"] == "2025-01" + assert result[0]["title"] == "2025-01 pre-meeting mailing" + assert result[0]["year"] == "2025" + assert result[1]["mailing_date"] == "2025-02" + assert result[2]["mailing_date"] == "2024-11" + assert result[2]["year"] == "2024" + + +def test_fetch_all_mailings_calls_index_url(): + """fetch_all_mailings calls BASE_URL/ with timeout.""" + with 
patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.return_value = MagicMock(text="", raise_for_status=MagicMock()) + fetch_all_mailings() + m.assert_called_once_with(f"{BASE_URL}/", timeout=30) + + +# --- fetch_papers_for_mailing --- + + +def test_fetch_papers_for_mailing_returns_empty_on_request_failure(): + """fetch_papers_for_mailing returns [] when requests.get raises.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.side_effect = Exception("timeout") + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_returns_empty_when_anchor_missing(): + """fetch_papers_for_mailing returns [] when mailing anchor is not found.""" + html = "
x
" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_finds_anchor_by_id(): + """fetch_papers_for_mailing finds anchor by id=mailingYYYY-MM.""" + html = """ + + + + +
p1000r0.pdfTitleAuthor2025-01-15SG1
+ + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert len(result) == 1 + assert result[0]["paper_id"] == "p1000r0" + assert result[0]["filename"] == "p1000r0.pdf" + assert result[0]["title"] == "Title" + assert result[0]["authors"] == ["Author"] + assert result[0]["document_date"] == "2025-01-15" + assert result[0]["subgroup"] == "SG1" + + +def test_fetch_papers_for_mailing_finds_anchor_by_name(): + """fetch_papers_for_mailing finds anchor by name= when id is missing.""" + html = """ + + + + +
n5034.htmlDraft
+ + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert len(result) == 1 + assert result[0]["paper_id"] == "n5034" + assert result[0]["type"] == "html" + + +def test_fetch_papers_for_mailing_normalizes_paper_id_lowercase(): + """fetch_papers_for_mailing returns paper_id in lowercase.""" + html = """ + + + + +
P3039R1.PDF
+ + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result[0]["paper_id"] == "p3039r1" + assert result[0]["filename"] == "p3039r1.pdf" + + +def test_fetch_papers_for_mailing_returns_empty_when_no_table(): + """fetch_papers_for_mailing returns [] when no table follows anchor.""" + html = """ + + +

No table here

+ + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + result = fetch_papers_for_mailing("2025", "2025-01") + assert result == [] + + +def test_fetch_papers_for_mailing_calls_year_url(): + """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout.""" + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + m.return_value = MagicMock(text="", raise_for_status=MagicMock()) + fetch_papers_for_mailing("2025", "2025-01") + m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30) diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py new file mode 100644 index 0000000..ca92819 --- /dev/null +++ b/wg21_paper_tracker/tests/test_models.py @@ -0,0 +1,76 @@ +"""Tests for wg21_paper_tracker.models.""" + +from datetime import date + +import pytest + +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@pytest.mark.django_db +def test_wg21_mailing_str(): + """WG21Mailing.__str__ returns mailing_date and title.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="2025-01 pre-meeting") + assert str(m) == "2025-01 (2025-01 pre-meeting)" + + +@pytest.mark.django_db +def test_wg21_paper_str(): + """WG21Paper.__str__ returns paper_id and truncated title.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title") + p = WG21Paper.objects.create( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="A short title", + document_date=date(2025, 1, 15), + mailing=m, + year=2025, + ) + assert "p1000r0" in str(p) + assert "A short title" in str(p) + + +@pytest.mark.django_db +def test_wg21_paper_str_truncates_long_title(): + """WG21Paper.__str__ truncates title to 60 chars.""" + m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title") + long_title = "x" * 100 + p = WG21Paper.objects.create( + paper_id="p1", + 
url="https://example.com/p.pdf", + title=long_title, + mailing=m, + year=2025, + ) + assert len(str(p).split(": ", 1)[-1]) <= 60 + + +@pytest.mark.django_db +def test_wg21_mailing_ordering(): + """WG21Mailing default ordering is by mailing_date descending.""" + WG21Mailing.objects.create(mailing_date="2025-01", title="A") + WG21Mailing.objects.create(mailing_date="2025-02", title="B") + dates = list(WG21Mailing.objects.values_list("mailing_date", flat=True)) + assert dates == ["2025-02", "2025-01"] + + +@pytest.mark.django_db +def test_wg21_paper_unique_together_paper_id_year(): + """WG21Paper allows same paper_id with different year.""" + m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1") + m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2") + WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/1.pdf", + title="T1", + mailing=m1, + year=2024, + ) + p2 = WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/2.pdf", + title="T2", + mailing=m2, + year=2025, + ) + assert p2.pk is not None diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py new file mode 100644 index 0000000..e052ce9 --- /dev/null +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -0,0 +1,149 @@ +"""Tests for wg21_paper_tracker.pipeline.""" + +import time +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +from wg21_paper_tracker.pipeline import ( + DOWNLOAD_TIMEOUT, + DOWNLOAD_MAX_RETRIES, + _download_file, + run_tracker_pipeline, +) + + +# --- _download_file --- + + +def test_download_file_success_text(tmp_path): + """_download_file saves text response and returns True.""" + url = "https://example.com/doc.html" + filepath = tmp_path / "doc.html" + resp = MagicMock() + resp.raise_for_status = MagicMock() + resp.headers = {"content-type": "text/html; charset=utf-8"} + resp.content = b"Hello" + resp.apparent_encoding = "utf-8" + resp.iter_content = 
None + with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp): + result = _download_file(url, filepath) + assert result is True + assert filepath.read_text(encoding="utf-8") == "Hello" + + +def test_download_file_success_binary(tmp_path): + """_download_file saves binary response and returns True.""" + url = "https://example.com/doc.pdf" + filepath = tmp_path / "doc.pdf" + resp = MagicMock() + resp.raise_for_status = MagicMock() + resp.headers = {"content-type": "application/pdf"} + resp.iter_content = lambda chunk_size: (b"\x25\x50\x44\x46",) + with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp): + result = _download_file(url, filepath) + assert result is True + assert filepath.read_bytes() == b"\x25\x50\x44\x46" + + +def test_download_file_uses_timeout(): + """_download_file calls requests.get with DOWNLOAD_TIMEOUT.""" + url = "https://example.com/f" + filepath = Path("/tmp/out") + resp = MagicMock() + resp.raise_for_status = MagicMock() + resp.headers = {"content-type": "text/plain"} + resp.content = b"x" + resp.apparent_encoding = "utf-8" + with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp) as m: + _download_file(url, filepath) + m.assert_called_once() + assert m.call_args[1]["timeout"] == DOWNLOAD_TIMEOUT + + +def test_download_file_retries_on_failure(tmp_path): + """_download_file retries up to DOWNLOAD_MAX_RETRIES then returns False.""" + url = "https://example.com/f" + filepath = tmp_path / "f" + with patch("wg21_paper_tracker.pipeline.requests.get") as m: + m.side_effect = Exception("connection error") + with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock: + result = _download_file(url, filepath) + assert result is False + assert m.call_count == DOWNLOAD_MAX_RETRIES + assert sleep_mock.call_count == DOWNLOAD_MAX_RETRIES - 1 + + +def test_download_file_succeeds_on_second_attempt(tmp_path): + """_download_file succeeds when a retry succeeds.""" + url = "https://example.com/f" + 
filepath = tmp_path / "f" + resp = MagicMock() + resp.raise_for_status = MagicMock() + resp.headers = {"content-type": "text/plain"} + resp.content = b"ok" + resp.apparent_encoding = "utf-8" + with patch("wg21_paper_tracker.pipeline.requests.get") as m: + m.side_effect = [Exception("first fail"), resp] + with patch("wg21_paper_tracker.pipeline.time.sleep"): + result = _download_file(url, filepath) + assert result is True + assert m.call_count == 2 + assert filepath.read_text() == "ok" + + +# --- run_tracker_pipeline --- + + +@pytest.mark.django_db +def test_run_tracker_pipeline_returns_zero_when_no_mailings(): + """run_tracker_pipeline returns 0 when fetch_all_mailings returns [].""" + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]): + n = run_tracker_pipeline() + assert n == 0 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_skips_when_no_new_mailings(): + """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB.""" + from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m: + m.return_value = [ + {"mailing_date": "2025-01", "title": "Old", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest", "year": "2025"}, + ] + with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[]): + n = run_tracker_pipeline() + assert n == 0 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_downloads_new_papers(tmp_path): + """run_tracker_pipeline downloads papers for new mailings and returns count.""" + from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-01", "title": "Previous", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": 
"https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers): + with patch("wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path): + with patch("wg21_paper_tracker.pipeline._download_file", return_value=True): + with patch("wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None): + n = run_tracker_pipeline() + assert n == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py new file mode 100644 index 0000000..4463f54 --- /dev/null +++ b/wg21_paper_tracker/tests/test_services.py @@ -0,0 +1,223 @@ +"""Tests for wg21_paper_tracker.services.""" + +from datetime import date +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, + mark_paper_downloaded, +) + + +# --- get_or_create_mailing --- + + +@pytest.mark.django_db +def test_get_or_create_mailing_creates_new(): + """get_or_create_mailing creates new mailing and returns (mailing, True).""" + m, created = get_or_create_mailing("2025-01", "2025-01 pre-meeting mailing") + assert created is True + assert m.mailing_date == "2025-01" + assert m.title == "2025-01 pre-meeting mailing" + + +@pytest.mark.django_db +def test_get_or_create_mailing_gets_existing(): + """get_or_create_mailing returns existing mailing and (mailing, False).""" + get_or_create_mailing("2025-01", "Original title") + m2, created2 = get_or_create_mailing("2025-01", "Updated title") + assert created2 is False + assert m2.mailing_date == "2025-01" + assert m2.title == "Updated title" # title is updated when different + + +@pytest.mark.django_db +def 
test_get_or_create_mailing_updates_title_when_different(): + """get_or_create_mailing updates title when existing has different title.""" + get_or_create_mailing("2025-02", "Old title") + m, _ = get_or_create_mailing("2025-02", "New title") + m.refresh_from_db() + assert m.title == "New title" + + +# --- get_or_create_paper --- + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +def test_get_or_create_paper_creates_new(mock_profile, db): + """get_or_create_paper creates new paper and returns (paper, True).""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=date(2025, 1, 15), + mailing=mailing, + subgroup="SG1", + author_names=None, + year=2025, + ) + assert created is True + assert paper.paper_id == "p1000r0" + assert paper.title == "A paper" + assert paper.year == 2025 + assert paper.mailing_id == mailing.id + assert paper.subgroup == "SG1" + mock_profile.assert_not_called() + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): + """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" + from unittest.mock import MagicMock + profile = MagicMock() + profile.pk = 1 + mock_profile.return_value = (profile, True) + + mailing, _ = get_or_create_mailing("2025-01", "Title") + with patch("wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create") as mock_link: + mock_link.return_value = (MagicMock(), True) + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=None, + mailing=mailing, + author_names=["Alice", "Bob"], + year=2025, + ) + assert created is True + assert mock_profile.call_count == 2 + 
mock_profile.assert_any_call("Alice") + mock_profile.assert_any_call("Bob") + + +@pytest.mark.django_db +def test_get_or_create_paper_normalizes_paper_id_lowercase(db): + """get_or_create_paper stores paper_id in lowercase.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id=" P3039R1 ", + url="https://example.com/p3039r1.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + assert paper.paper_id == "p3039r1" + + +@pytest.mark.django_db +def test_get_or_create_paper_gets_existing_and_updates(db): + """get_or_create_paper returns existing and updates fields when different.""" + mailing1, _ = get_or_create_mailing("2025-01", "M1") + mailing2, _ = get_or_create_mailing("2025-02", "M2") + get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/old.pdf", + title="Old title", + document_date=date(2025, 1, 1), + mailing=mailing1, + subgroup="SG1", + year=2025, + ) + paper2, created2 = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/new.pdf", + title="New title", + document_date=date(2025, 2, 1), + mailing=mailing2, + subgroup="SG2", + year=2025, + ) + assert created2 is False + paper2.refresh_from_db() + assert paper2.url == "https://example.com/new.pdf" + assert paper2.title == "New title" + assert paper2.mailing_id == mailing2.id + assert paper2.subgroup == "SG2" + + +@pytest.mark.django_db +def test_get_or_create_paper_year_none_stored_as_null(db): + """get_or_create_paper with year=None stores null.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="n5034", + url="https://example.com/n5034.html", + title="Draft", + document_date=None, + mailing=mailing, + year=None, + ) + assert paper.year is None + + +@pytest.mark.django_db +def test_get_or_create_paper_same_paper_id_different_year_creates_two(db): + """get_or_create_paper creates separate rows for same paper_id different year (unique_together).""" + 
mailing1, _ = get_or_create_mailing("2024-11", "M1") + mailing2, _ = get_or_create_mailing("2025-01", "M2") + p1, c1 = get_or_create_paper( + paper_id="sd-1", + url="https://example.com/sd-1-2024.pdf", + title="SD 2024", + document_date=None, + mailing=mailing1, + year=2024, + ) + p2, c2 = get_or_create_paper( + paper_id="sd-1", + url="https://example.com/sd-1-2025.pdf", + title="SD 2025", + document_date=None, + mailing=mailing2, + year=2025, + ) + assert c1 is True and c2 is True + assert p1.pk != p2.pk + assert p1.year == 2024 and p2.year == 2025 + + +# --- mark_paper_downloaded --- + + +@pytest.mark.django_db +def test_mark_paper_downloaded_sets_flag(db): + """mark_paper_downloaded sets is_downloaded=True for matching paper_id.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + assert paper.is_downloaded is False + mark_paper_downloaded("p1000r0") + paper.refresh_from_db() + assert paper.is_downloaded is True + + +@pytest.mark.django_db +def test_mark_paper_downloaded_normalizes_paper_id(db): + """mark_paper_downloaded matches case-insensitively (normalizes to lower).""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + mark_paper_downloaded(" P1000R0 ") + paper.refresh_from_db() + assert paper.is_downloaded is True diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py new file mode 100644 index 0000000..25a828e --- /dev/null +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -0,0 +1,73 @@ +"""Tests for wg21_paper_tracker.workspace.""" + +from pathlib import Path +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.workspace import get_workspace_root, 
get_raw_dir + + +@pytest.fixture +def mock_workspace_path(tmp_path): + """Patch get_workspace_path to return tmp_path for app slugs.""" + + def _get_path(app_slug): + p = tmp_path / app_slug.replace("/", "_") + p.mkdir(parents=True, exist_ok=True) + return p + + with patch("wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path): + yield tmp_path + + +def test_get_workspace_root_returns_path(mock_workspace_path): + """get_workspace_root returns Path for app workspace.""" + root = get_workspace_root() + assert "wg21_paper_tracker" in str(root) + assert root.is_dir() + + +def test_get_workspace_root_calls_get_workspace_path_with_slug(): + """get_workspace_root calls get_workspace_path with app slug.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + m.return_value = Path("/fake/workspace/wg21_paper_tracker") + root = get_workspace_root() + m.assert_called_once_with("wg21_paper_tracker") + assert root == Path("/fake/workspace/wg21_paper_tracker") + + +def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path): + """get_raw_dir returns raw/wg21_paper_tracker//.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + raw_root = mock_workspace_path / "raw_wg21_paper_tracker" + raw_root.mkdir(parents=True, exist_ok=True) + m.side_effect = lambda slug: { + "wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker", + "raw/wg21_paper_tracker": raw_root, + }[slug] + path = get_raw_dir("2025-01") + assert path == raw_root / "2025-01" + assert path.is_dir() + + +def test_get_raw_dir_creates_parents(mock_workspace_path): + """get_raw_dir creates parent directories.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + raw_root = mock_workspace_path / "raw_app" + raw_root.mkdir(parents=True, exist_ok=True) + m.side_effect = lambda slug: raw_root if "raw" in slug else (mock_workspace_path / "app") + path = get_raw_dir("2026-02") + assert path.exists() + assert path.name == "2026-02" 
+ + +def test_get_raw_dir_idempotent(mock_workspace_path): + """get_raw_dir can be called twice for same mailing_date without error.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + raw_root = mock_workspace_path / "raw" + raw_root.mkdir(parents=True, exist_ok=True) + m.side_effect = lambda slug: raw_root + p1 = get_raw_dir("2025-01") + p2 = get_raw_dir("2025-01") + assert p1 == p2 From 18f07c3536f7cf841c91d1fffb6d7b7f81827d61 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 19:22:05 -0700 Subject: [PATCH 03/76] Fix lint/format error #24 --- .../commands/import_wg21_metadata_from_csv.py | 25 +++++++++++-------- wg21_paper_tracker/tests/test_fetcher.py | 6 +++-- wg21_paper_tracker/tests/test_models.py | 2 +- wg21_paper_tracker/tests/test_pipeline.py | 23 ++++++++++++----- wg21_paper_tracker/tests/test_services.py | 6 +++-- wg21_paper_tracker/tests/test_workspace.py | 8 ++++-- 6 files changed, 46 insertions(+), 24 deletions(-) diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 5d4a398..966ce64 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -41,11 +41,7 @@ def _normalize_title(raw: str) -> str: if not raw: return "" one_line = " ".join(raw.split()) - return ( - one_line[:TITLE_MAX_LENGTH] - if len(one_line) > TITLE_MAX_LENGTH - else one_line - ) + return one_line[:TITLE_MAX_LENGTH] if len(one_line) > TITLE_MAX_LENGTH else one_line def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]: @@ -117,9 +113,7 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): - csv_path = options.get("csv_file") or ( - get_workspace_root() / "metadata.csv" - ) + csv_path = options.get("csv_file") or (get_workspace_root() / "metadata.csv") dry_run = options["dry_run"] if not 
csv_path.exists(): @@ -224,16 +218,25 @@ def handle(self, *args, **options): paper.year = year paper.save() if author_names: - from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + from cppa_user_tracker.services import ( + get_or_create_wg21_paper_author_profile, + ) + for name in author_names: - profile, _ = get_or_create_wg21_paper_author_profile(name) + profile, _ = get_or_create_wg21_paper_author_profile( + name + ) WG21PaperAuthor.objects.get_or_create( paper=paper, profile=profile, ) except Exception as inner: stats["skipped"] += 1 - logger.error("Error for paper_id=%s (after IntegrityError): %s", paper_id, inner) + logger.error( + "Error for paper_id=%s (after IntegrityError): %s", + paper_id, + inner, + ) except Exception as e: stats["skipped"] += 1 logger.error("Error for paper_id=%s: %s", paper_id, e) diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index a06317a..70a2338 100644 --- a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -2,7 +2,6 @@ from unittest.mock import patch, MagicMock -import pytest from wg21_paper_tracker.fetcher import ( BASE_URL, @@ -174,6 +173,9 @@ def test_fetch_papers_for_mailing_returns_empty_when_no_table(): def test_fetch_papers_for_mailing_calls_year_url(): """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.return_value = MagicMock(text="", raise_for_status=MagicMock()) + m.return_value = MagicMock( + text="", + raise_for_status=MagicMock(), + ) fetch_papers_for_mailing("2025", "2025-01") m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30) diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py index ca92819..5d9a1ac 100644 --- a/wg21_paper_tracker/tests/test_models.py +++ b/wg21_paper_tracker/tests/test_models.py @@ -4,7 +4,7 @@ import pytest -from wg21_paper_tracker.models import 
WG21Mailing, WG21Paper, WG21PaperAuthor +from wg21_paper_tracker.models import WG21Mailing, WG21Paper @pytest.mark.django_db diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index e052ce9..ad4df9c 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -1,6 +1,5 @@ """Tests for wg21_paper_tracker.pipeline.""" -import time from pathlib import Path from unittest.mock import patch, MagicMock @@ -108,13 +107,16 @@ def test_run_tracker_pipeline_returns_zero_when_no_mailings(): def test_run_tracker_pipeline_skips_when_no_new_mailings(): """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB.""" from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") with patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m: m.return_value = [ {"mailing_date": "2025-01", "title": "Old", "year": "2025"}, {"mailing_date": "2025-02", "title": "Latest", "year": "2025"}, ] - with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[]): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[] + ): n = run_tracker_pipeline() assert n == 0 @@ -123,6 +125,7 @@ def test_run_tracker_pipeline_skips_when_no_new_mailings(): def test_run_tracker_pipeline_downloads_new_papers(tmp_path): """run_tracker_pipeline downloads papers for new mailings and returns count.""" from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") mailings = [ {"mailing_date": "2025-01", "title": "Previous", "year": "2025"}, @@ -141,9 +144,17 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path): }, ] with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): - with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers): - with 
patch("wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path): - with patch("wg21_paper_tracker.pipeline._download_file", return_value=True): - with patch("wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + with patch( + "wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path + ): + with patch( + "wg21_paper_tracker.pipeline._download_file", return_value=True + ): + with patch( + "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None + ): n = run_tracker_pipeline() assert n == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index 4463f54..a0a9b6f 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -5,7 +5,6 @@ import pytest -from wg21_paper_tracker.models import WG21Mailing, WG21Paper from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, @@ -76,12 +75,15 @@ def test_get_or_create_paper_creates_new(mock_profile, db): def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" from unittest.mock import MagicMock + profile = MagicMock() profile.pk = 1 mock_profile.return_value = (profile, True) mailing, _ = get_or_create_mailing("2025-01", "Title") - with patch("wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create") as mock_link: + with patch( + "wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create" + ) as mock_link: mock_link.return_value = (MagicMock(), True) paper, created = get_or_create_paper( paper_id="p1000r0", diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py index 25a828e..3689ae9 100644 --- a/wg21_paper_tracker/tests/test_workspace.py +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -17,7 +17,9 
@@ def _get_path(app_slug): p.mkdir(parents=True, exist_ok=True) return p - with patch("wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path): + with patch( + "wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path + ): yield tmp_path @@ -56,7 +58,9 @@ def test_get_raw_dir_creates_parents(mock_workspace_path): with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: raw_root = mock_workspace_path / "raw_app" raw_root.mkdir(parents=True, exist_ok=True) - m.side_effect = lambda slug: raw_root if "raw" in slug else (mock_workspace_path / "app") + m.side_effect = lambda slug: ( + raw_root if "raw" in slug else (mock_workspace_path / "app") + ) path = get_raw_dir("2026-02") assert path.exists() assert path.name == "2026-02" From f4388ffabb8fd007768d877125518a9afbab4788 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 13:51:22 -0700 Subject: [PATCH 04/76] Validate mailing_date in get_raw_dir; WG21 author order/resolution and docs #24 --- ...0006_wg21paperauthorprofile_author_alas.py | 19 +++++ cppa_user_tracker/models.py | 1 + cppa_user_tracker/services.py | 34 +++++++- cppa_user_tracker/tests/test_services.py | 77 +++++++++++++++++++ docs/Schema.md | 15 ++-- docs/operations/WG21_Cloud_Run.md | 4 +- docs/service_api/cppa_user_tracker.md | 8 ++ wg21_paper_tracker/cloud_run_job/Dockerfile | 6 +- .../converters/openai_converter.py | 23 +++--- .../converters/pdfplumber_converter.py | 13 +++- wg21_paper_tracker/cloud_run_job/main.py | 4 +- .../cloud_run_job/requirements.txt | 2 +- wg21_paper_tracker/fetcher.py | 8 +- .../commands/import_wg21_metadata_from_csv.py | 10 +-- .../commands/run_wg21_paper_tracker.py | 6 +- wg21_paper_tracker/migrations/0001_initial.py | 8 +- wg21_paper_tracker/models.py | 5 +- wg21_paper_tracker/pipeline.py | 20 ++++- wg21_paper_tracker/services.py | 54 ++++++++++--- wg21_paper_tracker/tests/test_fetcher.py | 13 ++-- wg21_paper_tracker/tests/test_pipeline.py | 5 +- 
wg21_paper_tracker/tests/test_services.py | 71 +++++++++++------ wg21_paper_tracker/tests/test_workspace.py | 20 ++++- wg21_paper_tracker/workspace.py | 4 + 24 files changed, 338 insertions(+), 92 deletions(-) create mode 100644 cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py new file mode 100644 index 0000000..9c47bb5 --- /dev/null +++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py @@ -0,0 +1,19 @@ +# Generated by Django 4.2.28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), + ] + + operations = [ + migrations.AddField( + model_name="wg21paperauthorprofile", + name="author_alas", + field=models.CharField(blank=True, db_index=True, default="", max_length=255), + preserve_default=True, + ), + ] diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 46be627..75a52c6 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -165,6 +165,7 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) display_name = models.CharField(max_length=255, db_index=True, blank=True) + author_alas = models.CharField(max_length=255, blank=True, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 35b4e31..146f778 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -357,9 +357,35 @@ def get_or_create_discord_profile( def get_or_create_wg21_paper_author_profile( display_name: str, -) -> tuple[Any, bool]: - """Get or create a WG21PaperAuthorProfile by display_name.""" + email: Optional[str] = None, +) -> tuple[WG21PaperAuthorProfile, 
bool]: + """Get or create a WG21PaperAuthorProfile by display_name, with optional email disambiguation. + + Finds all profiles with the given display_name. If none exist, creates one and adds + email if provided. If one exists, returns it. If multiple exist, and email is + provided, returns the one with that email if any; otherwise returns the first. + """ display_name_val = (display_name or "").strip() - return WG21PaperAuthorProfile.objects.get_or_create( - display_name=display_name_val, + email_val = (email or "").strip() or None + + candidates = list( + WG21PaperAuthorProfile.objects.filter(display_name=display_name_val).order_by( + "id" + ) ) + + if not candidates: + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) + if email_val: + add_email(profile, email_val, is_primary=True) + return profile, True + + if len(candidates) == 1: + return candidates[0], False + + # Two or more: disambiguate by email if provided + if email_val: + for p in candidates: + if p.emails.filter(email=email_val).exists(): + return p, False + return candidates[0], False diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index cf61481..6d4d85b 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -8,6 +8,7 @@ GitHubAccountType, Identity, TempProfileIdentityRelation, + WG21PaperAuthorProfile, ) from cppa_user_tracker import services @@ -569,3 +570,79 @@ def test_get_or_create_mailing_list_profile_strips_display_name_and_email(): assert created is True assert profile.display_name == "Trimmed" assert profile.emails.filter(email="trimmed@example.com").exists() + + +# --- get_or_create_wg21_paper_author_profile --- + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_creates(): + """get_or_create_wg21_paper_author_profile creates new profile when none exist.""" + profile, created = services.get_or_create_wg21_paper_author_profile( 
+ display_name="New Author" + ) + assert created is True + assert profile.display_name == "New Author" + assert WG21PaperAuthorProfile.objects.filter(display_name="New Author").count() == 1 + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_with_email_adds_email(): + """get_or_create_wg21_paper_author_profile adds email to new profile when provided.""" + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Author With Email", + email="author@example.com", + ) + assert created is True + assert profile.emails.filter(email="author@example.com").exists() + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): + """get_or_create_wg21_paper_author_profile returns existing profile when exactly one matches.""" + existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Solo Author" + ) + assert created is False + assert profile.id == existing.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first(): + """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + _second = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Dup Name" + ) + assert created is False + assert profile.id == first.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_second(): + """get_or_create_wg21_paper_author_profile returns profile with matching email when multiple match.""" + _first = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + second = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + 
services.add_email(second, "match@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Same Name", + email="match@example.com", + ) + assert created is False + assert profile.id == second.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_returns_first(): + """get_or_create_wg21_paper_author_profile returns first when email provided but no match.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + second = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + services.add_email(second, "other@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Other Name", + email="nomatch@example.com", + ) + assert created is False + assert profile.id == first.id diff --git a/docs/Schema.md b/docs/Schema.md index 308a662..6b22def 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -68,6 +68,7 @@ erDiagram WG21PaperAuthorProfile { string display_name "IX" + string author_alas "IX" datetime created_at datetime updated_at } @@ -618,6 +619,7 @@ erDiagram int id PK int paper_id FK int profile_id FK + int author_order datetime created_at } @@ -631,7 +633,8 @@ erDiagram WG21Paper { int id PK - string paper_id UK "IX" + string paper_id "IX" + int year "IX" string url string title "IX" date document_date "IX" @@ -647,7 +650,9 @@ erDiagram **Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing. -**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. +**Note:** **WG21Paper** is uniquely identified by the composite `(paper_id, year)`; `paper_id` is not globally unique. The same paper identifier may appear in different years (e.g. revisions). 
+ +**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. `author_order` is optional and 1-based; it indicates the order of authors on the paper. --- @@ -720,7 +725,7 @@ erDiagram | **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. | 1 | | **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | | **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | -| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | +| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alas`. | 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | | **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | @@ -761,8 +766,8 @@ erDiagram | **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | | **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | | **WG21Mailing** | WG21 mailing release (mailing_date, title). | 7 | -| **WG21Paper** | WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded). | 7 | -| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). | 7 | +| **WG21Paper** | WG21 paper (paper_id, year, url, title, document_date, mailing, subgroup, is_downloaded). Unique on (paper_id, year). | 7 | +| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id→WG21PaperAuthorProfile). Optional `author_order` (1-based) for ordering. | 7 | | **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | | **WebsiteVisitCount** | Per-date, per-country visit count. | 8 | | **WebsiteWordCount** | Per-date, per-word count. 
| 8 | diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index 257e2bc..7840bd1 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -44,9 +44,11 @@ gcloud run jobs create wg21-convert \ --memory 8Gi \ --cpu 4 \ --region us-central1 \ - --set-env-vars WG21_GCS_BUCKET=wg21-data-collector,OPENROUTER_API_KEY=your_key + --set-env-vars WG21_GCS_BUCKET=wg21-data-collector ``` +Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manager](https://cloud.google.com/run/docs/configuring/secrets)) rather than inline in `--set-env-vars`, to avoid leaking the key into shell history, CI logs, or audit trails. + ## 4. Service Account & IAM Permissions 1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API. diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index f638501..4ca0adb 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -41,6 +41,14 @@ --- +## WG21PaperAuthorProfile + +| Function | Parameter types | Return type | Description | +| -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name. If no profile exists, creates one and adds email if provided. If one exists, returns it. 
If multiple exist, and email is provided, returns the one with that email if any; otherwise returns the first. Use this when linking paper authors so that same name + same email link to the same profile. | + +--- + ## DiscordProfile | Function | Parameter types | Return type | Description | diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile index 21b51ef..d52244b 100644 --- a/wg21_paper_tracker/cloud_run_job/Dockerfile +++ b/wg21_paper_tracker/cloud_run_job/Dockerfile @@ -11,6 +11,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libglib2.0-0 \ && rm -rf /var/lib/apt/lists/* +RUN groupadd -r app && useradd -r -g app app + # Copy requirements COPY requirements.txt . @@ -18,7 +20,9 @@ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Copy application files -COPY . . +COPY --chown=app:app . . + +USER app # Run the main script CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index cd168aa..3a94230 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -2,12 +2,16 @@ OpenAI/OpenRouter-based PDF to Markdown converter with OCR. 
""" -import os +from __future__ import annotations + import base64 +import io +import logging +import os from pathlib import Path from typing import Optional + import requests -import logging logger = logging.getLogger(__name__) @@ -21,7 +25,6 @@ try: from pdf2image import convert_from_path from PIL import Image, ImageOps - import io PDF2IMAGE_AVAILABLE = True except ImportError: @@ -211,6 +214,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: total_pages = len(images) markdown_parts = [] + successful_pages = 0 # Process each page for page_num, image in enumerate(images, 1): @@ -226,6 +230,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: if page_markdown: markdown_parts.append(page_markdown) markdown_parts.append("\n\n") + successful_pages += 1 else: logger.warning(f"Failed to convert page {page_num} with OpenAI") markdown_parts.append( @@ -243,17 +248,17 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: markdown_content = "".join(markdown_parts) - if markdown_content and len(markdown_content.strip()) > 0: + if successful_pages > 0 and markdown_content.strip(): logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") logger.info( f"Extracted {len(markdown_content)} characters from {total_pages} pages" ) return markdown_content - else: - logger.warning( - f"OpenAI/OpenRouter conversion returned empty content for: {pdf_path.name}" - ) - return None + logger.warning( + "OpenAI/OpenRouter conversion produced no usable pages for: %s", + pdf_path.name, + ) + return None except Exception as e: logger.error( diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 58a1465..6329c5a 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -46,7 +46,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: text = 
page.extract_text() if text: - markdown_parts.append(text) + markdown_parts.append(text.replace("\n", " \n")) markdown_parts.append("\n\n") # Extract tables if any @@ -55,6 +55,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: for table in tables: if table: markdown_parts.append("\n### Table\n\n") + first_row = True # Convert table to markdown format for row in table: if row: @@ -66,6 +67,13 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: ) + " |\n" ) + if first_row: + markdown_parts.append( + "| " + + " | ".join("---" for _ in row) + + " |\n" + ) + first_row = False markdown_parts.append("\n") except Exception as e: @@ -88,6 +96,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: except Exception as e: logger.error( - f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", + exc_info=True, ) return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index cf704ae..cdfe40a 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -2,6 +2,8 @@ import logging from pathlib import Path import tempfile +from typing import Optional + from google.cloud import storage from converters.docling_converter import convert_with_docling @@ -16,7 +18,7 @@ MIN_CONTENT_LENGTH = 50 -def is_content_valid(content: str) -> bool: +def is_content_valid(content: Optional[str]) -> bool: if not content: return False content_stripped = content.strip() diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt index 0a00731..096efc5 100644 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ b/wg21_paper_tracker/cloud_run_job/requirements.txt @@ -1,6 +1,6 @@ docling>=1.0.0 pdfplumber>=0.10.0 pdf2image>=1.16.0 -Pillow>=10.0.0 +Pillow>=10.3.0 requests>=2.31.0 google-cloud-storage>=2.14.0 diff --git 
a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index e733e83..4e44bd0 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -29,8 +29,8 @@ def fetch_all_mailings() -> list[dict]: try: response = requests.get(f"{BASE_URL}/", timeout=30) response.raise_for_status() - except Exception as e: - logger.error("Failed to fetch WG21 index: %s", e) + except requests.RequestException: + logger.error("Failed to fetch WG21 index.") return [] # The mailings are listed in a markdown-like syntax or links @@ -65,8 +65,8 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: try: response = requests.get(url, timeout=30) response.raise_for_status() - except Exception as e: - logger.error("Failed to fetch year page %s: %s", year, e) + except requests.RequestException: + logger.error("Failed to fetch year page %s.", year) return [] soup = BeautifulSoup(response.text, "html.parser") diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 966ce64..0d1b903 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -16,10 +16,11 @@ from django.db import IntegrityError from django.utils.dateparse import parse_date -from wg21_paper_tracker.models import WG21Paper, WG21PaperAuthor +from wg21_paper_tracker.models import WG21Paper from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, + get_or_create_paper_author, ) from wg21_paper_tracker.workspace import get_workspace_root @@ -222,14 +223,11 @@ def handle(self, *args, **options): get_or_create_wg21_paper_author_profile, ) - for name in author_names: + for i, name in enumerate(author_names): profile, _ = get_or_create_wg21_paper_author_profile( name ) - WG21PaperAuthor.objects.get_or_create( - paper=paper, - profile=profile, - ) + 
get_or_create_paper_author(paper, profile, i + 1) except Exception as inner: stats["skipped"] += 1 logger.error( diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index f771043..3945269 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -65,9 +65,9 @@ def handle(self, *args, **options): logger.info("Downloaded and uploaded %d new papers.", total_new_papers) if total_new_papers > 0: - project_id = settings.GCP_PROJECT_ID - location = settings.GCP_LOCATION - job_name = settings.WG21_CLOUD_RUN_JOB_NAME + project_id = getattr(settings, "GCP_PROJECT_ID", None) + location = getattr(settings, "GCP_LOCATION", "us-central1") + job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) if project_id and job_name: try: diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index b4f9635..a2bbf3d 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Merged initial migration: WG21 Mailing, WG21 Paper (with year), WG21 Paper Author +# Merged initial migration: WG21 Mailing, WG21 Paper (year not null), WG21 Paper Author from django.db import migrations, models import django.db.models.deletion @@ -59,10 +59,7 @@ class Migration(migrations.Migration): "document_date", models.DateField(blank=True, db_index=True, null=True), ), - ( - "year", - models.IntegerField(blank=True, db_index=True, null=True), - ), + ("year", models.IntegerField(db_index=True, default=0)), ( "subgroup", models.CharField( @@ -104,6 +101,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), + ("author_order", models.PositiveIntegerField(blank=True, null=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ( "paper", diff --git a/wg21_paper_tracker/models.py 
b/wg21_paper_tracker/models.py index 44754b4..fede57b 100644 --- a/wg21_paper_tracker/models.py +++ b/wg21_paper_tracker/models.py @@ -30,7 +30,7 @@ class WG21Paper(models.Model): url = models.URLField(max_length=1024) title = models.CharField(max_length=1024, db_index=True) document_date = models.DateField(db_index=True, null=True, blank=True) - year = models.IntegerField(null=True, blank=True, db_index=True) + year = models.IntegerField(default=0, db_index=True) mailing = models.ForeignKey( WG21Mailing, on_delete=models.CASCADE, @@ -42,7 +42,7 @@ class WG21Paper(models.Model): updated_at = models.DateTimeField(auto_now=True) class Meta: - unique_together = (("paper_id", "year"),) + unique_together = [["paper_id", "year"]] ordering = ["-document_date", "-paper_id", "-year"] verbose_name = "WG21 Paper" verbose_name_plural = "WG21 Papers" @@ -66,6 +66,7 @@ class WG21PaperAuthor(models.Model): related_name="papers", db_column="profile_id", ) + author_order = models.PositiveIntegerField(null=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) class Meta: diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index edcf003..16a9bb6 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -186,10 +186,14 @@ def format_priority(ext: str) -> int: raw_dir = get_raw_dir(mailing_date) skipped_downloaded = 0 + year_val = year if year is not None else 0 for pid, p_list in papers_by_id.items(): - # Check DB if this paper_id is already fully downloaded - existing_paper = WG21Paper.objects.filter(paper_id=pid).first() - if existing_paper and existing_paper.is_downloaded: + # Skip only if this (paper_id, year) is already downloaded + if WG21Paper.objects.filter( + paper_id=pid, + year=year_val, + is_downloaded=True, + ).exists(): skipped_downloaded += 1 continue @@ -197,7 +201,15 @@ def format_priority(ext: str) -> int: p_list.sort(key=lambda x: format_priority(x["type"])) best_paper = p_list[0] - filename = 
best_paper["filename"] + raw_filename = (best_paper.get("filename") or "").strip() + filename = Path(raw_filename).name + if not filename or filename != raw_filename: + logger.warning( + "Skipping paper %s due to unsafe filename %r", + pid, + raw_filename, + ) + continue local_path = raw_dir / filename url = best_paper["url"] diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index cf846b0..4328711 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -2,13 +2,18 @@ Database logic for WG21 Paper Tracker. """ -from typing import Optional +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional from django.db import transaction from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor +if TYPE_CHECKING: + from cppa_user_tracker.models import WG21PaperAuthorProfile + @transaction.atomic def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: @@ -30,10 +35,11 @@ def get_or_create_paper( mailing: WG21Mailing, subgroup: str = "", author_names: Optional[list[str]] = None, + author_emails: Optional[list[str]] = None, year: int | None = None, ) -> tuple[WG21Paper, bool]: paper_id = (paper_id or "").strip().lower() - year_val = None + year_val = 0 if year: s = (year if isinstance(year, str) else str(year)).strip()[:4] if s.isdigit(): @@ -66,23 +72,49 @@ def get_or_create_paper( if paper.subgroup != subgroup: paper.subgroup = subgroup updated = True - if year_val is not None and paper.year != year_val: + if paper.year != year_val: paper.year = year_val updated = True if updated: paper.save() if author_names: - for name in author_names: - profile, _ = get_or_create_wg21_paper_author_profile(name) - WG21PaperAuthor.objects.get_or_create( - paper=paper, - profile=profile, - ) + emails = author_emails or [] + for i, name in enumerate(author_names): + email = emails[i] if i 
< len(emails) else None + profile, _ = get_or_create_wg21_paper_author_profile(name, email=email) + get_or_create_paper_author(paper, profile, i + 1) return paper, created -def mark_paper_downloaded(paper_id: str): +def get_or_create_paper_author( + paper: WG21Paper, + profile: WG21PaperAuthorProfile, + author_order: int, +) -> tuple[WG21PaperAuthor, bool]: + """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based). + Updates author_order on existing link if it differs. + """ + link, link_created = WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + defaults={"author_order": author_order}, + ) + if not link_created and link.author_order != author_order: + link.author_order = author_order + link.save(update_fields=["author_order"]) + return link, link_created + + +def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() - WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True) + year_val = 0 + if year is not None: + s = (year if isinstance(year, str) else str(year)).strip()[:4] + if s.isdigit(): + year_val = int(s) + WG21Paper.objects.filter( + paper_id=paper_id, + year=year_val, + ).update(is_downloaded=True) diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index 70a2338..8b2ffec 100644 --- a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -2,6 +2,7 @@ from unittest.mock import patch, MagicMock +import requests from wg21_paper_tracker.fetcher import ( BASE_URL, @@ -14,18 +15,18 @@ def test_fetch_all_mailings_returns_empty_on_request_failure(): - """fetch_all_mailings returns [] when requests.get raises.""" + """fetch_all_mailings returns [] when requests.get raises RequestException.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.side_effect = Exception("network error") + m.side_effect = requests.RequestException("network error") 
result = fetch_all_mailings() assert result == [] def test_fetch_all_mailings_returns_empty_on_http_error(): - """fetch_all_mailings returns [] when response.raise_for_status raises.""" + """fetch_all_mailings returns [] when response.raise_for_status raises HTTPError.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: resp = MagicMock() - resp.raise_for_status.side_effect = Exception("404") + resp.raise_for_status.side_effect = requests.HTTPError("404") m.return_value = resp result = fetch_all_mailings() assert result == [] @@ -68,9 +69,9 @@ def test_fetch_all_mailings_calls_index_url(): def test_fetch_papers_for_mailing_returns_empty_on_request_failure(): - """fetch_papers_for_mailing returns [] when requests.get raises.""" + """fetch_papers_for_mailing returns [] when requests.get raises RequestException.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.side_effect = Exception("timeout") + m.side_effect = requests.RequestException("timeout") result = fetch_papers_for_mailing("2025", "2025-01") assert result == [] diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index ad4df9c..592ceec 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -4,6 +4,7 @@ from unittest.mock import patch, MagicMock import pytest +import requests from wg21_paper_tracker.pipeline import ( DOWNLOAD_TIMEOUT, @@ -66,7 +67,7 @@ def test_download_file_retries_on_failure(tmp_path): url = "https://example.com/f" filepath = tmp_path / "f" with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = Exception("connection error") + m.side_effect = requests.RequestException("connection error") with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock: result = _download_file(url, filepath) assert result is False @@ -84,7 +85,7 @@ def test_download_file_succeeds_on_second_attempt(tmp_path): resp.content = b"ok" resp.apparent_encoding = "utf-8" 
with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = [Exception("first fail"), resp] + m.side_effect = [requests.RequestException("first fail"), resp] with patch("wg21_paper_tracker.pipeline.time.sleep"): result = _download_file(url, filepath) assert result is True diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index a0a9b6f..6ec6a00 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -72,32 +72,35 @@ def test_get_or_create_paper_creates_new(mock_profile, db): @pytest.mark.django_db @patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") -def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): - """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" +@patch("wg21_paper_tracker.services.get_or_create_paper_author") +def test_get_or_create_paper_calls_author_profile_for_each_author( + mock_get_or_create_paper_author, mock_profile, db +): + """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author.""" from unittest.mock import MagicMock profile = MagicMock() profile.pk = 1 mock_profile.return_value = (profile, True) + mock_get_or_create_paper_author.return_value = (MagicMock(), True) mailing, _ = get_or_create_mailing("2025-01", "Title") - with patch( - "wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create" - ) as mock_link: - mock_link.return_value = (MagicMock(), True) - paper, created = get_or_create_paper( - paper_id="p1000r0", - url="https://example.com/p1000r0.pdf", - title="A paper", - document_date=None, - mailing=mailing, - author_names=["Alice", "Bob"], - year=2025, - ) + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=None, + mailing=mailing, + author_names=["Alice", "Bob"], + 
year=2025, + ) assert created is True assert mock_profile.call_count == 2 - mock_profile.assert_any_call("Alice") - mock_profile.assert_any_call("Bob") + mock_profile.assert_any_call("Alice", email=None) + mock_profile.assert_any_call("Bob", email=None) + assert mock_get_or_create_paper_author.call_count == 2 + mock_get_or_create_paper_author.assert_any_call(paper, profile, 1) + mock_get_or_create_paper_author.assert_any_call(paper, profile, 2) @pytest.mark.django_db @@ -147,8 +150,8 @@ def test_get_or_create_paper_gets_existing_and_updates(db): @pytest.mark.django_db -def test_get_or_create_paper_year_none_stored_as_null(db): - """get_or_create_paper with year=None stores null.""" +def test_get_or_create_paper_year_none_stored_as_zero(db): + """get_or_create_paper with year=None stores 0 for unknown year.""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="n5034", @@ -158,7 +161,7 @@ def test_get_or_create_paper_year_none_stored_as_null(db): mailing=mailing, year=None, ) - assert paper.year is None + assert paper.year == 0 @pytest.mark.django_db @@ -187,12 +190,32 @@ def test_get_or_create_paper_same_paper_id_different_year_creates_two(db): assert p1.year == 2024 and p2.year == 2025 +@pytest.mark.django_db +def test_get_or_create_paper_sets_author_order(db): + """get_or_create_paper sets author_order (1-based) on WG21PaperAuthor links.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p9999", + url="https://example.com/p9999.pdf", + title="Multi-author paper", + document_date=None, + mailing=mailing, + author_names=["First Author", "Second Author", "Third Author"], + year=2025, + ) + links = list(paper.authors.order_by("author_order")) + assert len(links) == 3 + assert links[0].author_order == 1 + assert links[1].author_order == 2 + assert links[2].author_order == 3 + + # --- mark_paper_downloaded --- @pytest.mark.django_db def 
test_mark_paper_downloaded_sets_flag(db): - """mark_paper_downloaded sets is_downloaded=True for matching paper_id.""" + """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year).""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="p1000r0", @@ -203,14 +226,14 @@ def test_mark_paper_downloaded_sets_flag(db): year=2025, ) assert paper.is_downloaded is False - mark_paper_downloaded("p1000r0") + mark_paper_downloaded("p1000r0", year=2025) paper.refresh_from_db() assert paper.is_downloaded is True @pytest.mark.django_db def test_mark_paper_downloaded_normalizes_paper_id(db): - """mark_paper_downloaded matches case-insensitively (normalizes to lower).""" + """mark_paper_downloaded matches case-insensitively (normalizes to lower) and by year.""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="p1000r0", @@ -220,6 +243,6 @@ def test_mark_paper_downloaded_normalizes_paper_id(db): mailing=mailing, year=2025, ) - mark_paper_downloaded(" P1000R0 ") + mark_paper_downloaded(" P1000R0 ", year=2025) paper.refresh_from_db() assert paper.is_downloaded is True diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py index 3689ae9..8c8365e 100644 --- a/wg21_paper_tracker/tests/test_workspace.py +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -18,7 +18,8 @@ def _get_path(app_slug): return p with patch( - "wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path + "wg21_paper_tracker.workspace.get_workspace_path", + side_effect=_get_path, ): yield tmp_path @@ -75,3 +76,20 @@ def test_get_raw_dir_idempotent(mock_workspace_path): p1 = get_raw_dir("2025-01") p2 = get_raw_dir("2025-01") assert p1 == p2 + assert p1.parent == p2.parent + + +def test_get_raw_dir_rejects_invalid_mailing_date(): + """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.).""" + with pytest.raises(ValueError, 
match="mailing_date must be in YYYY-MM format"): + get_raw_dir("../../tmp") + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025") + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-1") + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-13") + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("2025-00") + with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): + get_raw_dir("") diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py index 19c0d1b..89b853b 100644 --- a/wg21_paper_tracker/workspace.py +++ b/wg21_paper_tracker/workspace.py @@ -3,12 +3,14 @@ Temporary file storage during download before uploading to GCS. """ +import re from pathlib import Path from config.workspace import get_workspace_path _APP_SLUG = "wg21_paper_tracker" _RAW_APP_SLUG = f"raw/{_APP_SLUG}" +_MAILING_DATE_RE = re.compile(r"^\d{4}-(0[1-9]|1[0-2])$") def get_workspace_root() -> Path: @@ -17,6 +19,8 @@ def get_workspace_root() -> Path: def get_raw_dir(mailing_date: str) -> Path: """Return workspace/raw/wg21_paper_tracker//; creates if missing.""" + if not _MAILING_DATE_RE.fullmatch(mailing_date): + raise ValueError("mailing_date must be in YYYY-MM format") raw_root = get_workspace_path(_RAW_APP_SLUG) path = raw_root / mailing_date path.mkdir(parents=True, exist_ok=True) From 62d5d427aff6c64c66ce1572110c7291b809f2c0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 14:39:05 -0700 Subject: [PATCH 05/76] Fix: WG21 tracker (year, GCS guard, IntegrityError), author_alias, Pillow, test fixes #24 --- ...06_wg21paperauthorprofile_author_alias.py} | 2 +- cppa_user_tracker/models.py | 2 +- docs/Schema.md | 4 +- .../converters/openai_converter.py | 2 +- wg21_paper_tracker/cloud_run_job/main.py | 2 +- .../cloud_run_job/requirements.txt | 2 +- 
.../commands/import_wg21_metadata_from_csv.py | 7 +++- .../commands/run_wg21_paper_tracker.py | 11 ++++-- wg21_paper_tracker/pipeline.py | 38 +++++++++++++++---- wg21_paper_tracker/tests/test_models.py | 12 +++++- wg21_paper_tracker/tests/test_pipeline.py | 9 ++++- wg21_paper_tracker/tests/test_services.py | 17 ++++++--- 12 files changed, 79 insertions(+), 29 deletions(-) rename cppa_user_tracker/migrations/{0006_wg21paperauthorprofile_author_alas.py => 0006_wg21paperauthorprofile_author_alias.py} (93%) diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py similarity index 93% rename from cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py rename to cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py index 9c47bb5..674176a 100644 --- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py +++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py @@ -12,7 +12,7 @@ class Migration(migrations.Migration): operations = [ migrations.AddField( model_name="wg21paperauthorprofile", - name="author_alas", + name="author_alias", field=models.CharField(blank=True, db_index=True, default="", max_length=255), preserve_default=True, ), diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 75a52c6..70dca3d 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -165,7 +165,7 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) display_name = models.CharField(max_length=255, db_index=True, blank=True) - author_alas = models.CharField(max_length=255, blank=True, db_index=True) + author_alias = models.CharField(max_length=255, blank=True, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/docs/Schema.md b/docs/Schema.md index 6b22def..12e676d 100644 --- 
a/docs/Schema.md +++ b/docs/Schema.md @@ -68,7 +68,7 @@ erDiagram WG21PaperAuthorProfile { string display_name "IX" - string author_alas "IX" + string author_alias "IX" datetime created_at datetime updated_at } @@ -725,7 +725,7 @@ erDiagram | **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. | 1 | | **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | | **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | -| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alas`. | 1 | +| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alias`. | 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | | **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. 
| 2 | diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 3a94230..078e984 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -242,7 +242,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: f"Error processing page {page_num}: {str(e)}", exc_info=True ) markdown_parts.append( - f"## Page {page_num}\n\n*[Error processing this page: {str(e)}]*\n\n" + f"## Page {page_num}\n\n*[Error processing this page]*\n\n" ) continue diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index cdfe40a..e1a0153 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -37,7 +37,7 @@ def is_content_valid(content: Optional[str]) -> bool: first_part = content_lower[:1000] for pattern in error_patterns: if pattern in first_part: - if pattern.startswith("error:") or pattern.startswith("exception:"): + if pattern in ("error:", "exception:"): return False idx = first_part.find(pattern) if idx < 100: diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt index 096efc5..82422b1 100644 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ b/wg21_paper_tracker/cloud_run_job/requirements.txt @@ -1,6 +1,6 @@ docling>=1.0.0 pdfplumber>=0.10.0 pdf2image>=1.16.0 -Pillow>=10.3.0 +Pillow>=12.1.1 requests>=2.31.0 google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 0d1b903..fc45d7f 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -202,10 +202,13 @@ def handle(self, *args, **options): else: 
stats["papers_updated"] += 1 except IntegrityError as e: - # Duplicate (paper_id) or (paper_id, year): fetch existing and update + # Duplicate (paper_id, year): fetch existing by same key and update stats["papers_updated"] += 1 try: - paper = WG21Paper.objects.filter(paper_id=paper_id).first() + lookup_year = year if year is not None else 0 + paper = WG21Paper.objects.filter( + paper_id=paper_id, year=lookup_year + ).first() if paper is None: stats["skipped"] += 1 logger.error("Error for paper_id=%s: %s", paper_id, e) diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index 3945269..bfbb838 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -51,7 +51,8 @@ def handle(self, *args, **options): With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. Otherwise runs the pipeline, then triggers the configured Cloud Run job when - total_new_papers > 0 and GCP_PROJECT_ID and WG21_CLOUD_RUN_JOB_NAME are set. + total_new_papers > 0 and GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and + WG21_GCS_BUCKET are set (trigger is skipped if GCS upload is disabled). 
""" dry_run = options.get("dry_run", False) if dry_run: @@ -62,14 +63,15 @@ def handle(self, *args, **options): try: total_new_papers = run_tracker_pipeline() - logger.info("Downloaded and uploaded %d new papers.", total_new_papers) + logger.info("Processed %d new papers.", total_new_papers) if total_new_papers > 0: project_id = getattr(settings, "GCP_PROJECT_ID", None) location = getattr(settings, "GCP_LOCATION", "us-central1") job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) + bucket = getattr(settings, "WG21_GCS_BUCKET", None) - if project_id and job_name: + if project_id and job_name and bucket: try: trigger_cloud_run_job(project_id, location, job_name) logger.info( @@ -79,7 +81,8 @@ def handle(self, *args, **options): logger.error("Failed to trigger Cloud Run job: %s", e) else: logger.warning( - "GCP_PROJECT_ID not configured. Skipping Cloud Run trigger." + "Skipping Cloud Run trigger because GCP_PROJECT_ID, " + "WG21_CLOUD_RUN_JOB_NAME, or WG21_GCS_BUCKET is not configured." ) else: logger.info("No new papers found. 
Skipping Cloud Run job.") diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 16a9bb6..e0b4f87 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -157,13 +157,36 @@ def run_tracker_pipeline() -> int: for m_info in new_mailings: mailing_date = m_info["mailing_date"] title = m_info["title"] - year = int(m_info["year"]) if m_info["year"] else None + # Normalize year once; use 0 when missing/empty/unparseable so you can fix later + year_raw = m_info.get("year") + if not year_raw or not str(year_raw).strip(): + year = 0 + logger.warning( + "Mailing %s: year missing or empty, using 0 (fix later).", + mailing_date, + ) + else: + try: + year = int(str(year_raw).strip()[:4]) + if year <= 0: + year = 0 + logger.warning( + "Mailing %s: year invalid, using 0 (fix later).", + mailing_date, + ) + except (ValueError, TypeError): + year = 0 + logger.warning( + "Mailing %s: year not parseable %r, using 0 (fix later).", + mailing_date, + year_raw, + ) # Create/get mailing in DB mailing_obj, _ = get_or_create_mailing(mailing_date, title) # Fetch papers for this mailing - papers = fetch_papers_for_mailing(year, mailing_date) + papers = fetch_papers_for_mailing(str(year), mailing_date) if not papers: logger.info( "Mailing %s: no papers found (anchor/table may be missing).", @@ -180,18 +203,17 @@ def run_tracker_pipeline() -> int: papers_by_id[pid].append(p) def format_priority(ext: str) -> int: - priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4} + priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} return priorities.get(ext.lower(), 100) raw_dir = get_raw_dir(mailing_date) skipped_downloaded = 0 - year_val = year if year is not None else 0 for pid, p_list in papers_by_id.items(): # Skip only if this (paper_id, year) is already downloaded if WG21Paper.objects.filter( paper_id=pid, - year=year_val, + year=year, is_downloaded=True, ).exists(): skipped_downloaded += 1 @@ -220,8 +242,10 @@ def format_priority(ext: str) 
-> int: gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}" uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path) else: - # If no GCS, simulate success so DB is updated - uploaded = True + logger.warning( + "WG21_GCS_BUCKET is not configured; leaving %s as not downloaded.", + pid, + ) # Persist DB doc_date_str = best_paper["document_date"] diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py index 5d9a1ac..9b4ee7e 100644 --- a/wg21_paper_tracker/tests/test_models.py +++ b/wg21_paper_tracker/tests/test_models.py @@ -3,6 +3,7 @@ from datetime import date import pytest +from django.db import IntegrityError, transaction from wg21_paper_tracker.models import WG21Mailing, WG21Paper @@ -56,7 +57,7 @@ def test_wg21_mailing_ordering(): @pytest.mark.django_db def test_wg21_paper_unique_together_paper_id_year(): - """WG21Paper allows same paper_id with different year.""" + """WG21Paper allows same paper_id with different year; rejects duplicate (paper_id, year).""" m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1") m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2") WG21Paper.objects.create( @@ -66,6 +67,15 @@ def test_wg21_paper_unique_together_paper_id_year(): mailing=m1, year=2024, ) + with pytest.raises(IntegrityError): + with transaction.atomic(): + WG21Paper.objects.create( + paper_id="sd-1", + url="https://example.com/dup.pdf", + title="T1 dup", + mailing=m1, + year=2024, + ) p2 = WG21Paper.objects.create( paper_id="sd-1", url="https://example.com/2.pdf", diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index 592ceec..4756ffd 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -155,7 +155,12 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path): "wg21_paper_tracker.pipeline._download_file", return_value=True ): with patch( - 
"wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None + "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", + "test-bucket", ): - n = run_tracker_pipeline() + with patch( + "wg21_paper_tracker.pipeline._upload_to_gcs", + return_value=True, + ): + n = run_tracker_pipeline() assert n == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index 6ec6a00..023f15c 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -79,9 +79,14 @@ def test_get_or_create_paper_calls_author_profile_for_each_author( """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author.""" from unittest.mock import MagicMock - profile = MagicMock() - profile.pk = 1 - mock_profile.return_value = (profile, True) + alice_profile = MagicMock() + alice_profile.pk = 1 + bob_profile = MagicMock() + bob_profile.pk = 2 + mock_profile.side_effect = [ + (alice_profile, True), + (bob_profile, True), + ] mock_get_or_create_paper_author.return_value = (MagicMock(), True) mailing, _ = get_or_create_mailing("2025-01", "Title") @@ -99,8 +104,8 @@ def test_get_or_create_paper_calls_author_profile_for_each_author( mock_profile.assert_any_call("Alice", email=None) mock_profile.assert_any_call("Bob", email=None) assert mock_get_or_create_paper_author.call_count == 2 - mock_get_or_create_paper_author.assert_any_call(paper, profile, 1) - mock_get_or_create_paper_author.assert_any_call(paper, profile, 2) + mock_get_or_create_paper_author.assert_any_call(paper, alice_profile, 1) + mock_get_or_create_paper_author.assert_any_call(paper, bob_profile, 2) @pytest.mark.django_db @@ -151,7 +156,7 @@ def test_get_or_create_paper_gets_existing_and_updates(db): @pytest.mark.django_db def test_get_or_create_paper_year_none_stored_as_zero(db): - """get_or_create_paper with year=None stores 0 for unknown year.""" + """get_or_create_paper with year=None stores 0 so records 
can be updated later.""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="n5034", From e3e91c85b9d550e0e548f842628bafcfd27a6b70 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 16:52:31 -0700 Subject: [PATCH 06/76] =?UTF-8?q?Fix:=20WG21=20=E2=80=93=20optional=20Clou?= =?UTF-8?q?d=20Run,=20per-blob=20isolation,=20PDF=20priority,=20year=3D0?= =?UTF-8?q?=20promotion,=20logging=20#24?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/settings.py | 5 +- docs/operations/WG21_Cloud_Run.md | 3 +- wg21_paper_tracker/cloud_run_job/main.py | 56 +++++++------ wg21_paper_tracker/fetcher.py | 4 +- .../commands/run_wg21_paper_tracker.py | 19 +++-- wg21_paper_tracker/pipeline.py | 8 +- wg21_paper_tracker/services.py | 82 ++++++++++++++----- 7 files changed, 112 insertions(+), 65 deletions(-) diff --git a/config/settings.py b/config/settings.py index 925ebe5..17f549a 100644 --- a/config/settings.py +++ b/config/settings.py @@ -220,9 +220,8 @@ WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = ( - env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "" -).strip() +WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="") or "").strip() +WG21_CLOUD_RUN_ENABLED = env.bool("WG21_CLOUD_RUN_ENABLED", default=False) # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index 7840bd1..e3b1338 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -11,7 +11,8 @@ Create a GCS bucket (e.g., `wg21-data-collector`). 
Ensure your Django app has the following environment variables configured: - `WG21_GCS_BUCKET`: The name of the GCS bucket. - `GCP_PROJECT_ID`: Your GCP project ID. -- `WG21_CLOUD_RUN_JOB_NAME`: (Optional, defaults to `wg21-convert`) The name of the deployed Cloud Run job. +- `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). No default; leave unset if you only use GCS uploads without triggering the job. +- `WG21_CLOUD_RUN_ENABLED`: (Optional, default `false`) Set to `true` to allow the tracker to trigger the Cloud Run conversion job when new papers are uploaded. Keeps the trigger optional even when project and bucket are set. - `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job. ## 2. Build and Push the Docker Image diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index e1a0153..e2f9781 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -11,7 +11,8 @@ from converters.openai_converter import convert_with_openai logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) @@ -73,7 +74,7 @@ def main(): client = storage.Client() bucket = client.bucket(bucket_name) - raw_prefix = "raw/wg21_papers/" + raw_prefix = "raw/wg21_paper_tracker/" converted_prefix = "converted/wg21_papers/" blobs = client.list_blobs(bucket, prefix=raw_prefix) @@ -83,31 +84,34 @@ def main(): if not blob.name.lower().endswith(".pdf"): continue - # e.g. 
raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix) :] - md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" - md_blob_name = f"{converted_prefix}{md_relative_path}" - - md_blob = bucket.blob(md_blob_name) - if md_blob.exists(): - logger.info("Skipping %s, MD already exists.", blob.name) - continue - local_pdf_path = Path(tmpdir) / "temp.pdf" - logger.info("Downloading %s to process...", blob.name) - blob.download_to_filename(str(local_pdf_path)) - - logger.info("Converting %s...", blob.name) - md_content = convert_pdf_to_md(local_pdf_path) - - if md_content: - md_blob.upload_from_string(md_content, content_type="text/markdown") - logger.info("Successfully converted and uploaded %s", md_blob_name) - else: - logger.error("Failed to convert %s", blob.name) - - if local_pdf_path.exists(): - local_pdf_path.unlink() + try: + # e.g. raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf + relative_path = blob.name[len(raw_prefix) :] + md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" + md_blob_name = f"{converted_prefix}{md_relative_path}" + + md_blob = bucket.blob(md_blob_name) + if md_blob.exists(): + logger.info("Skipping %s, MD already exists.", blob.name) + continue + + logger.info("Downloading %s to process...", blob.name) + blob.download_to_filename(str(local_pdf_path)) + + logger.info("Converting %s...", blob.name) + md_content = convert_pdf_to_md(local_pdf_path) + + if md_content: + md_blob.upload_from_string(md_content, content_type="text/markdown") + logger.info("Successfully converted and uploaded %s", md_blob_name) + else: + logger.error("Failed to convert %s", blob.name) + except Exception: + logger.exception("Failed processing %s", blob.name) + finally: + if local_pdf_path.exists(): + local_pdf_path.unlink() if __name__ == "__main__": diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index 4e44bd0..a5d0cc4 100644 --- a/wg21_paper_tracker/fetcher.py +++ 
b/wg21_paper_tracker/fetcher.py @@ -30,7 +30,7 @@ def fetch_all_mailings() -> list[dict]: response = requests.get(f"{BASE_URL}/", timeout=30) response.raise_for_status() except requests.RequestException: - logger.error("Failed to fetch WG21 index.") + logger.exception("Failed to fetch WG21 index.") return [] # The mailings are listed in a markdown-like syntax or links @@ -66,7 +66,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: response = requests.get(url, timeout=30) response.raise_for_status() except requests.RequestException: - logger.error("Failed to fetch year page %s.", year) + logger.exception("Failed to fetch year page %s.", year) return [] soup = BeautifulSoup(response.text, "html.parser") diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index bfbb838..b1885af 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -51,8 +51,8 @@ def handle(self, *args, **options): With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. Otherwise runs the pipeline, then triggers the configured Cloud Run job when - total_new_papers > 0 and GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and - WG21_GCS_BUCKET are set (trigger is skipped if GCS upload is disabled). + total_new_papers > 0, WG21_CLOUD_RUN_ENABLED is True, and + GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and WG21_GCS_BUCKET are set. 
""" dry_run = options.get("dry_run", False) if dry_run: @@ -70,19 +70,24 @@ def handle(self, *args, **options): location = getattr(settings, "GCP_LOCATION", "us-central1") job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) bucket = getattr(settings, "WG21_GCS_BUCKET", None) + cloud_run_enabled = getattr(settings, "WG21_CLOUD_RUN_ENABLED", False) - if project_id and job_name and bucket: + if project_id and job_name and bucket and cloud_run_enabled: try: trigger_cloud_run_job(project_id, location, job_name) logger.info( "Successfully triggered Cloud Run job %s.", job_name ) - except Exception as e: - logger.error("Failed to trigger Cloud Run job: %s", e) + except Exception: + logger.exception( + "Failed to trigger Cloud Run job %s.", job_name + ) + raise else: logger.warning( - "Skipping Cloud Run trigger because GCP_PROJECT_ID, " - "WG21_CLOUD_RUN_JOB_NAME, or WG21_GCS_BUCKET is not configured." + "Skipping Cloud Run trigger: set WG21_CLOUD_RUN_ENABLED=True " + "and configure GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and " + "WG21_GCS_BUCKET to enable." ) else: logger.info("No new papers found. 
Skipping Cloud Run job.") diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index e0b4f87..8baa910 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -203,7 +203,8 @@ def run_tracker_pipeline() -> int: papers_by_id[pid].append(p) def format_priority(ext: str) -> int: - priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} + # Prefer PDF (Cloud Run converts PDFs); then html, adoc, ps + priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4} return priorities.get(ext.lower(), 100) raw_dir = get_raw_dir(mailing_date) @@ -219,9 +220,8 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue - # Pick the best format - p_list.sort(key=lambda x: format_priority(x["type"])) - best_paper = p_list[0] + # Pick the best format (PDF first for conversion) + best_paper = min(p_list, key=lambda x: format_priority(x["type"])) raw_filename = (best_paper.get("filename") or "").strip() filename = Path(raw_filename).name diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 4328711..f773b75 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -15,6 +15,16 @@ from cppa_user_tracker.models import WG21PaperAuthorProfile +def _normalize_year(year: int | str | None) -> int: + """Return a 4-digit year as int, or 0 if missing/invalid.""" + if year is None: + return 0 + if isinstance(year, int): + return year if 0 < year <= 9999 else 0 + s = str(year).strip()[:4] + return int(s) if s.isdigit() else 0 + + @transaction.atomic def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: mailing, created = WG21Mailing.objects.get_or_create( @@ -39,23 +49,9 @@ def get_or_create_paper( year: int | None = None, ) -> tuple[WG21Paper, bool]: paper_id = (paper_id or "").strip().lower() - year_val = 0 - if year: - s = (year if isinstance(year, str) else str(year)).strip()[:4] - if s.isdigit(): - year_val = int(s) - paper, created = 
WG21Paper.objects.get_or_create( - paper_id=paper_id, - year=year_val, - defaults={ - "url": url, - "title": title, - "document_date": document_date, - "mailing": mailing, - "subgroup": subgroup, - }, - ) - if not created: + year_val = _normalize_year(year) + + def _update_paper(paper: WG21Paper) -> bool: updated = False if paper.url != url: paper.url = url @@ -77,6 +73,52 @@ def get_or_create_paper( updated = True if updated: paper.save() + return updated + + if year_val > 0: + # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year + paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() + if paper: + _update_paper(paper) + created = False + else: + placeholder = WG21Paper.objects.filter(paper_id=paper_id, year=0).first() + if placeholder: + placeholder.url = url + placeholder.title = title + placeholder.document_date = document_date + placeholder.mailing = mailing + placeholder.subgroup = subgroup + placeholder.year = year_val + placeholder.save() + paper = placeholder + created = False + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=year_val, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=0, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) + if not created: + _update_paper(paper) if author_names: emails = author_emails or [] @@ -109,11 +151,7 @@ def get_or_create_paper_author( def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() - year_val = 0 - if year is not None: - s = (year if isinstance(year, str) else str(year)).strip()[:4] - if s.isdigit(): - year_val = int(s) + year_val = _normalize_year(year) WG21Paper.objects.filter( paper_id=paper_id, year=year_val, 
From 2159f5348874fc423fec61aaf67d14d25b2475be Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 17:43:13 -0700 Subject: [PATCH 07/76] wg21: fix author_alias migration default, fail job when bucket unset, use raw/wg21_paper_tracker/YYYY// #24 --- ...006_wg21paperauthorprofile_author_alias.py | 2 +- docs/operations/WG21_Cloud_Run.md | 3 ++- wg21_paper_tracker/cloud_run_job/main.py | 4 +-- wg21_paper_tracker/pipeline.py | 9 ++++--- wg21_paper_tracker/tests/test_workspace.py | 25 ++++++++++--------- wg21_paper_tracker/workspace.py | 9 ++++--- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py index 674176a..1660763 100644 --- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py +++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py @@ -14,6 +14,6 @@ class Migration(migrations.Migration): model_name="wg21paperauthorprofile", name="author_alias", field=models.CharField(blank=True, db_index=True, default="", max_length=255), - preserve_default=True, + preserve_default=False, ), ] diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index e3b1338..cabd828 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -9,6 +9,7 @@ The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job vi Create a GCS bucket (e.g., `wg21-data-collector`). Ensure your Django app has the following environment variables configured: + - `WG21_GCS_BUCKET`: The name of the GCS bucket. - `GCP_PROJECT_ID`: Your GCP project ID. - `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). No default; leave unset if you only use GCS uploads without triggering the job. @@ -59,6 +60,6 @@ Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manage 1. 
**Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs. 2. It checks the WG21 site for new mailings. -3. If found, it downloads PDFs and uploads them directly to `gs:///raw/wg21_papers//`. +3. If found, it downloads PDFs and uploads them directly to `gs:///raw/wg21_paper_tracker///`. 4. It calls the Cloud Run API to execute `wg21-convert`. 5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs:///converted/wg21_papers//`. diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index e2f9781..61c57dc 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -69,7 +69,7 @@ def main(): bucket_name = os.getenv("WG21_GCS_BUCKET") if not bucket_name: logger.error("WG21_GCS_BUCKET env var not set.") - return + raise RuntimeError("WG21_GCS_BUCKET env var not set.") client = storage.Client() bucket = client.bucket(bucket_name) @@ -86,7 +86,7 @@ def main(): local_pdf_path = Path(tmpdir) / "temp.pdf" try: - # e.g. raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf + # e.g. 
raw/wg21_paper_tracker/2025/2025-02/p0149r1.pdf -> 2025/2025-02/p0149r1.pdf relative_path = blob.name[len(raw_prefix) :] md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" md_blob_name = f"{converted_prefix}{md_relative_path}" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 8baa910..639d400 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -203,11 +203,10 @@ def run_tracker_pipeline() -> int: papers_by_id[pid].append(p) def format_priority(ext: str) -> int: - # Prefer PDF (Cloud Run converts PDFs); then html, adoc, ps - priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4} + priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} return priorities.get(ext.lower(), 100) - raw_dir = get_raw_dir(mailing_date) + raw_dir = get_raw_dir(mailing_date, year) skipped_downloaded = 0 for pid, p_list in papers_by_id.items(): @@ -239,7 +238,9 @@ def format_priority(ext: str) -> int: if _download_file(url, local_path): uploaded = False if bucket_name: - gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}" + gcs_path = ( + f"raw/wg21_paper_tracker/{year}/{mailing_date}/{filename}" + ) uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path) else: logger.warning( diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py index 8c8365e..4e50899 100644 --- a/wg21_paper_tracker/tests/test_workspace.py +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -41,7 +41,7 @@ def test_get_workspace_root_calls_get_workspace_path_with_slug(): def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path): - """get_raw_dir returns raw/wg21_paper_tracker//.""" + """get_raw_dir returns raw/wg21_paper_tracker//.""" with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: raw_root = mock_workspace_path / "raw_wg21_paper_tracker" raw_root.mkdir(parents=True, exist_ok=True) @@ -49,8 +49,8 @@ def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path): 
"wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker", "raw/wg21_paper_tracker": raw_root, }[slug] - path = get_raw_dir("2025-01") - assert path == raw_root / "2025-01" + path = get_raw_dir("2025-01", 2025) + assert path == raw_root / "2025" / "2025-01" assert path.is_dir() @@ -62,8 +62,9 @@ def test_get_raw_dir_creates_parents(mock_workspace_path): m.side_effect = lambda slug: ( raw_root if "raw" in slug else (mock_workspace_path / "app") ) - path = get_raw_dir("2026-02") + path = get_raw_dir("2026-02", 2026) assert path.exists() + assert path.parent.name == "2026" assert path.name == "2026-02" @@ -73,8 +74,8 @@ def test_get_raw_dir_idempotent(mock_workspace_path): raw_root = mock_workspace_path / "raw" raw_root.mkdir(parents=True, exist_ok=True) m.side_effect = lambda slug: raw_root - p1 = get_raw_dir("2025-01") - p2 = get_raw_dir("2025-01") + p1 = get_raw_dir("2025-01", 2025) + p2 = get_raw_dir("2025-01", 2025) assert p1 == p2 assert p1.parent == p2.parent @@ -82,14 +83,14 @@ def test_get_raw_dir_idempotent(mock_workspace_path): def test_get_raw_dir_rejects_invalid_mailing_date(): """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.).""" with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("../../tmp") + get_raw_dir("../../tmp", 2025) with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("2025") + get_raw_dir("2025", 2025) with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("2025-1") + get_raw_dir("2025-1", 2025) with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("2025-13") + get_raw_dir("2025-13", 2025) with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("2025-00") + get_raw_dir("2025-00", 2025) with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"): - get_raw_dir("") + get_raw_dir("", 2025) 
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py index 89b853b..04542fe 100644 --- a/wg21_paper_tracker/workspace.py +++ b/wg21_paper_tracker/workspace.py @@ -17,11 +17,14 @@ def get_workspace_root() -> Path: return get_workspace_path(_APP_SLUG) -def get_raw_dir(mailing_date: str) -> Path: - """Return workspace/raw/wg21_paper_tracker//; creates if missing.""" +def get_raw_dir(mailing_date: str | None, year: int) -> Path: + """Return workspace/raw/wg21_paper_tracker///; creates if missing.""" if not _MAILING_DATE_RE.fullmatch(mailing_date): raise ValueError("mailing_date must be in YYYY-MM format") raw_root = get_workspace_path(_RAW_APP_SLUG) - path = raw_root / mailing_date + if mailing_date: + path = raw_root / str(year) / mailing_date + else: + path = raw_root / str(year) path.mkdir(parents=True, exist_ok=True) return path From be392a0403d38db9c31d0e8ac635a9ca398b0d9b Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 18:11:01 -0700 Subject: [PATCH 08/76] wg21: honor settings.RAW_DIR for raw paper storage #24 --- dev-24error: | 0 wg21_paper_tracker/fetcher.py | 49 +++++++++++++++---- .../commands/import_wg21_metadata_from_csv.py | 5 +- wg21_paper_tracker/tests/test_commands.py | 19 +++++++ wg21_paper_tracker/tests/test_fetcher.py | 24 +++++++++ wg21_paper_tracker/tests/test_workspace.py | 28 ++++------- wg21_paper_tracker/workspace.py | 8 ++- 7 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 dev-24error: create mode 100644 wg21_paper_tracker/tests/test_commands.py diff --git a/dev-24error: b/dev-24error: new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index a5d0cc4..2c6ad03 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -5,9 +5,11 @@ import re import urllib.parse +from typing import Optional import requests from bs4 import BeautifulSoup +from bs4.element import Tag import logging @@ -15,6 +17,34 
@@ BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" +_MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$") + + +def _find_table_in_section(anchor) -> Optional[Tag]: + """ + Find the first that belongs to the current mailing section. + Stops at the next mailing anchor (id/name matching mailingYYYY-MM) so we + do not attribute another mailing's table to this section. + """ + if not anchor: + return None + anchor_id = anchor.get("id") or anchor.get("name") or "" + if not _MAILING_ANCHOR_RE.match(anchor_id): + return None + for elem in anchor.next_elements: + if not hasattr(elem, "name"): # NavigableString, etc. + continue + if elem is anchor: + continue + if elem.name == "table": + return elem + if not hasattr(elem, "get"): # e.g. NavigableString + continue + next_id = elem.get("id") or elem.get("name") or "" + if next_id and _MAILING_ANCHOR_RE.match(next_id) and next_id != anchor_id: + return None # next section start; no table in this section + return None + def fetch_all_mailings() -> list[dict]: """ @@ -76,7 +106,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: logger.warning("Anchor %s not found on %s", anchor_id, url) return [] - table = anchor.find_next("table") + table = _find_table_in_section(anchor) if not table: logger.warning("No table found after anchor %s", anchor_id) return [] @@ -96,14 +126,15 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: href = link.get("href", "") match = paper_pattern.search(href) if match: - if href.startswith("../"): - paper_url = urllib.parse.urljoin(url, href) - elif href.startswith("/"): - paper_url = urllib.parse.urljoin(BASE_URL, href) - elif not href.startswith("http"): - paper_url = urllib.parse.urljoin(url, href) - else: - paper_url = href + paper_url = urllib.parse.urljoin(url, href) + parsed = urllib.parse.urlparse(paper_url) + base = urllib.parse.urlparse(BASE_URL) + if ( + parsed.scheme not in ("https", "http") + or parsed.netloc != 
base.netloc + ): + logger.warning("Skipping off-origin paper URL %s", paper_url) + continue paper_id = match.group(1).lower() file_ext = match.group(2).lower() diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index fc45d7f..3d63734 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -12,7 +12,7 @@ import re from pathlib import Path -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError from django.db import IntegrityError from django.utils.dateparse import parse_date @@ -118,8 +118,7 @@ def handle(self, *args, **options): dry_run = options["dry_run"] if not csv_path.exists(): - logger.error("File not found: %s", csv_path) - return + raise CommandError(f"File not found: {csv_path}") if dry_run: logger.info("Dry run: no DB writes.") diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py new file mode 100644 index 0000000..f9c9d7d --- /dev/null +++ b/wg21_paper_tracker/tests/test_commands.py @@ -0,0 +1,19 @@ +"""Tests for wg21_paper_tracker management commands.""" + +import pytest +from pathlib import Path + +from django.core.management import call_command +from django.core.management.base import CommandError + + +CMD_NAME = "import_wg21_metadata_from_csv" + + +def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): + """Command raises CommandError when CSV file does not exist.""" + csv_path = tmp_path / "nonexistent.csv" + assert not csv_path.exists() + + with pytest.raises(CommandError, match=r"File not found:"): + call_command(CMD_NAME, f"--csv-file={csv_path}") diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index 8b2ffec..3b903fb 100644 --- 
a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -171,6 +171,30 @@ def test_fetch_papers_for_mailing_returns_empty_when_no_table(): assert result == [] +def test_fetch_papers_for_mailing_does_not_use_next_mailings_table(): + """First mailing with no table returns []; second mailing's table is not used.""" + html = """ + +

2025-02

+

No papers this month.

+

2025-01

+
+ +
p1234r1.pdfPaperA. Author2025-01-10SG1
+ + """ + with patch("wg21_paper_tracker.fetcher.requests.get") as m: + resp = MagicMock() + resp.text = html + resp.raise_for_status = MagicMock() + m.return_value = resp + first = fetch_papers_for_mailing("2025", "2025-02") + second = fetch_papers_for_mailing("2025", "2025-01") + assert first == [], "2025-02 has no table; must not attribute 2025-01's table" + assert len(second) == 1 + assert second[0]["paper_id"] == "p1234r1" + + def test_fetch_papers_for_mailing_calls_year_url(): """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py index 4e50899..09986df 100644 --- a/wg21_paper_tracker/tests/test_workspace.py +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -41,27 +41,19 @@ def test_get_workspace_root_calls_get_workspace_path_with_slug(): def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path): - """get_raw_dir returns raw/wg21_paper_tracker//.""" - with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: - raw_root = mock_workspace_path / "raw_wg21_paper_tracker" - raw_root.mkdir(parents=True, exist_ok=True) - m.side_effect = lambda slug: { - "wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker", - "raw/wg21_paper_tracker": raw_root, - }[slug] + """get_raw_dir returns RAW_DIR/wg21_paper_tracker///.""" + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path path = get_raw_dir("2025-01", 2025) - assert path == raw_root / "2025" / "2025-01" + expected = mock_workspace_path / "wg21_paper_tracker" / "2025" / "2025-01" + assert path == expected assert path.is_dir() def test_get_raw_dir_creates_parents(mock_workspace_path): """get_raw_dir creates parent directories.""" - with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: - raw_root = mock_workspace_path / "raw_app" - 
raw_root.mkdir(parents=True, exist_ok=True) - m.side_effect = lambda slug: ( - raw_root if "raw" in slug else (mock_workspace_path / "app") - ) + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path path = get_raw_dir("2026-02", 2026) assert path.exists() assert path.parent.name == "2026" @@ -70,10 +62,8 @@ def test_get_raw_dir_creates_parents(mock_workspace_path): def test_get_raw_dir_idempotent(mock_workspace_path): """get_raw_dir can be called twice for same mailing_date without error.""" - with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: - raw_root = mock_workspace_path / "raw" - raw_root.mkdir(parents=True, exist_ok=True) - m.side_effect = lambda slug: raw_root + with patch("wg21_paper_tracker.workspace.settings") as mock_settings: + mock_settings.RAW_DIR = mock_workspace_path p1 = get_raw_dir("2025-01", 2025) p2 = get_raw_dir("2025-01", 2025) assert p1 == p2 diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py index 04542fe..1934ed8 100644 --- a/wg21_paper_tracker/workspace.py +++ b/wg21_paper_tracker/workspace.py @@ -6,6 +6,8 @@ import re from pathlib import Path +from django.conf import settings + from config.workspace import get_workspace_path _APP_SLUG = "wg21_paper_tracker" @@ -21,7 +23,11 @@ def get_raw_dir(mailing_date: str | None, year: int) -> Path: """Return workspace/raw/wg21_paper_tracker///; creates if missing.""" if not _MAILING_DATE_RE.fullmatch(mailing_date): raise ValueError("mailing_date must be in YYYY-MM format") - raw_root = get_workspace_path(_RAW_APP_SLUG) + if getattr(settings, "RAW_DIR", None): + raw_root = Path(settings.RAW_DIR) / _APP_SLUG + else: + raw_root = get_workspace_path(_RAW_APP_SLUG) + raw_root.mkdir(parents=True, exist_ok=True) if mailing_date: path = raw_root / str(year) / mailing_date else: From 005278a156cc35e33e9e7da1f3f2a7de3050b740 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 18:12:27 -0700 
Subject: [PATCH 09/76] Fix: lint/format error #24 --- wg21_paper_tracker/tests/test_commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py index f9c9d7d..34a52e9 100644 --- a/wg21_paper_tracker/tests/test_commands.py +++ b/wg21_paper_tracker/tests/test_commands.py @@ -1,7 +1,6 @@ """Tests for wg21_paper_tracker management commands.""" import pytest -from pathlib import Path from django.core.management import call_command from django.core.management.base import CommandError From 35476526f60d32bb3cbf5b6d3a162b9edaaddc17 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 18:57:32 -0700 Subject: [PATCH 10/76] fix(openai_converter): use neutral page placeholder for failed pages #24 --- cppa_user_tracker/services.py | 18 ++++++++-- .../converters/openai_converter.py | 7 ++-- .../converters/pdfplumber_converter.py | 2 +- .../commands/import_wg21_metadata_from_csv.py | 2 +- wg21_paper_tracker/pipeline.py | 34 ++++++++++--------- wg21_paper_tracker/tests/test_pipeline.py | 5 ++- wg21_paper_tracker/workspace.py | 2 +- 7 files changed, 43 insertions(+), 27 deletions(-) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 146f778..152bbc2 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -381,11 +381,25 @@ def get_or_create_wg21_paper_author_profile( return profile, True if len(candidates) == 1: - return candidates[0], False + profile = candidates[0] + if email_val and not profile.emails.filter(email=email_val).exists(): + add_email( + profile, + email_val, + is_primary=not profile.emails.filter(is_active=True).exists(), + ) + return profile, False # Two or more: disambiguate by email if provided if email_val: for p in candidates: if p.emails.filter(email=email_val).exists(): return p, False - return candidates[0], False + profile = candidates[0] + if email_val and not 
profile.emails.filter(email=email_val).exists(): + add_email( + profile, + email_val, + is_primary=not profile.emails.filter(is_active=True).exists(), + ) + return profile, False diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 078e984..ae17f6e 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -234,15 +234,16 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: else: logger.warning(f"Failed to convert page {page_num} with OpenAI") markdown_parts.append( - f"## Page {page_num}\n\n*[Conversion failed for this page]*\n\n" + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) except Exception as e: logger.error( - f"Error processing page {page_num}: {str(e)}", exc_info=True + f"Error processing page {page_num}: {str(e)}", + exc_info=True, ) markdown_parts.append( - f"## Page {page_num}\n\n*[Error processing this page]*\n\n" + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) continue diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 6329c5a..fb36c4e 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -62,7 +62,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: markdown_parts.append( "| " + " | ".join( - str(cell) if cell else "" + "" if cell is None else str(cell) for cell in row ) + " |\n" diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 3d63734..365a008 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ 
b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -202,7 +202,6 @@ def handle(self, *args, **options): stats["papers_updated"] += 1 except IntegrityError as e: # Duplicate (paper_id, year): fetch existing by same key and update - stats["papers_updated"] += 1 try: lookup_year = year if year is not None else 0 paper = WG21Paper.objects.filter( @@ -220,6 +219,7 @@ def handle(self, *args, **options): if year is not None: paper.year = year paper.save() + stats["papers_updated"] += 1 if author_names: from cppa_user_tracker.services import ( get_or_create_wg21_paper_author_profile, diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 639d400..966782e 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -126,21 +126,23 @@ def run_tracker_pipeline() -> int: # Filter newer mailings new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] - # Also check the latest one again just in case new papers were added - if latest_mailing and latest_mailing.mailing_date not in [ - m["mailing_date"] for m in new_mailings - ]: - # We re-check the most recent mailing from the DB to catch late additions - # Find the matching dict from all_mailings - current_m = next( - ( - m - for m in all_mailings - if m["mailing_date"] == latest_mailing.mailing_date - ), - None, + # Requeue incomplete mailings so transient failures get retried (not just the latest) + retry_dates = set( + WG21Mailing.objects.filter(papers__isnull=True).values_list( + "mailing_date", flat=True ) - if current_m: + ) + retry_dates.update( + WG21Mailing.objects.filter(papers__is_downloaded=False).values_list( + "mailing_date", flat=True + ) + ) + if latest_mailing: + retry_dates.add(latest_mailing.mailing_date) + for current_m in all_mailings: + if current_m["mailing_date"] in retry_dates and current_m[ + "mailing_date" + ] not in [x["mailing_date"] for x in new_mailings]: new_mailings.append(current_m) # Sort 
chronologically (oldest to newest) @@ -194,7 +196,7 @@ def run_tracker_pipeline() -> int: ) continue - # Group papers by ID to prioritize PDF over HTML (paper_id is case-insensitive) + # Group papers by ID so we can choose the preferred source format per paper. papers_by_id = {} for p in papers: pid = (p["paper_id"] or "").strip().lower() @@ -219,7 +221,7 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue - # Pick the best format (PDF first for conversion) + # Pick the preferred format: adoc > html > ps > pdf. best_paper = min(p_list, key=lambda x: format_priority(x["type"])) raw_filename = (best_paper.get("filename") or "").strip() diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index 4756ffd..ad56a29 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -1,6 +1,5 @@ """Tests for wg21_paper_tracker.pipeline.""" -from pathlib import Path from unittest.mock import patch, MagicMock import pytest @@ -47,10 +46,10 @@ def test_download_file_success_binary(tmp_path): assert filepath.read_bytes() == b"\x25\x50\x44\x46" -def test_download_file_uses_timeout(): +def test_download_file_uses_timeout(tmp_path): """_download_file calls requests.get with DOWNLOAD_TIMEOUT.""" url = "https://example.com/f" - filepath = Path("/tmp/out") + filepath = tmp_path / "out" resp = MagicMock() resp.raise_for_status = MagicMock() resp.headers = {"content-type": "text/plain"} diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py index 1934ed8..62ec55e 100644 --- a/wg21_paper_tracker/workspace.py +++ b/wg21_paper_tracker/workspace.py @@ -21,7 +21,7 @@ def get_workspace_root() -> Path: def get_raw_dir(mailing_date: str | None, year: int) -> Path: """Return workspace/raw/wg21_paper_tracker///; creates if missing.""" - if not _MAILING_DATE_RE.fullmatch(mailing_date): + if mailing_date is not None and not _MAILING_DATE_RE.fullmatch(mailing_date): 
raise ValueError("mailing_date must be in YYYY-MM format") if getattr(settings, "RAW_DIR", None): raw_root = Path(settings.RAW_DIR) / _APP_SLUG From 7403033971fcccf8b7c33d7b51626bf49af053e6 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 21:01:37 -0700 Subject: [PATCH 11/76] Fix: doc and converter fixes #24 --- docs/operations/WG21_Cloud_Run.md | 6 +- docs/service_api/cppa_user_tracker.md | 2 +- .../converters/openai_converter.py | 104 +++++++++++------- wg21_paper_tracker/pipeline.py | 9 +- 4 files changed, 74 insertions(+), 47 deletions(-) diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index cabd828..b1caccf 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -2,7 +2,7 @@ The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is separated from the main Django project and runs as a Google Cloud Run Job. -The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job via the Google Cloud Run API when new papers are downloaded. +When `WG21_CLOUD_RUN_ENABLED=true` and `WG21_CLOUD_RUN_JOB_NAME` is set, the Django tracker (`run_wg21_paper_tracker`) triggers the configured Cloud Run job after uploading new papers. ## 1. Setup Google Cloud Storage @@ -61,5 +61,5 @@ Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manage 1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs. 2. It checks the WG21 site for new mailings. 3. If found, it downloads PDFs and uploads them directly to `gs:///raw/wg21_paper_tracker///`. -4. It calls the Cloud Run API to execute `wg21-convert`. -5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs:///converted/wg21_papers//`. +4. If Cloud Run triggering is enabled, it calls the configured Cloud Run job. +5. 
The Cloud Run Job then spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs:///converted/wg21_papers//`. diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index 4ca0adb..bc89dbd 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -45,7 +45,7 @@ | Function | Parameter types | Return type | Description | | -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name. If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist, and email is provided, returns the one with that email if any; otherwise returns the first. Use this when linking paper authors so that same name + same email link to the same profile. | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, or multiple exist and one matches the email, returns that profile; otherwise returns the first. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. 
Use when linking paper authors so that same name + same email link to the same profile. | --- diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index ae17f6e..8952f15 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -8,6 +8,8 @@ import io import logging import os +import shutil +import tempfile from pathlib import Path from typing import Optional @@ -34,32 +36,46 @@ ) -def pdf_to_images(pdf_path: Path) -> list[Image.Image]: +def pdf_to_images(pdf_path: Path) -> tuple[Optional[Path], list[Path]]: """ - Convert PDF pages to images. + Convert PDF pages to image files on disk (one per page) to avoid loading all into memory. - Note: pdf2image should automatically handle PDF rotation metadata, - but we also apply additional rotation correction in correct_image_rotation(). + Writes images into a temporary directory and returns (tmp_dir, paths). Caller must process + each path and then remove tmp_dir (e.g. shutil.rmtree) so only the current page is resident. + + Note: pdf2image should automatically handle PDF rotation metadata; we also apply + additional rotation correction in correct_image_rotation() when loading each image. Args: pdf_path: Path to the PDF file. Returns: - List of PIL Image objects. + (tmp_dir, list of image paths). tmp_dir is None on failure or if pdf2image unavailable; + paths are in page order. Caller must cleanup tmp_dir when not None. 
""" if not PDF2IMAGE_AVAILABLE: logger.error("pdf2image is not available") - return [] + return (None, []) try: logger.info(f"Converting PDF to images: {pdf_path.name}") - # pdf2image should respect PDF rotation, but we'll also check EXIF data - images = convert_from_path(pdf_path, dpi=200) - logger.info(f"Converted {len(images)} pages to images") - return images + tmp_dir = Path(tempfile.mkdtemp(prefix="wg21_pdf_")) + try: + path_strs = convert_from_path( + pdf_path, + dpi=200, + paths_only=True, + output_folder=str(tmp_dir), + ) + paths = [Path(p) for p in path_strs] + logger.info(f"Converted {len(paths)} pages to images") + return (tmp_dir, paths) + except Exception: + shutil.rmtree(tmp_dir, ignore_errors=True) + raise except Exception as e: logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) - return [] + return (None, []) def correct_image_rotation(image: Image.Image) -> Image.Image: @@ -206,46 +222,50 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: try: logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - # Convert PDF to images - images = pdf_to_images(pdf_path) - if not images: + # Convert PDF to image files on disk (avoids loading all pages into memory) + tmp_dir, paths = pdf_to_images(pdf_path) + if not paths: logger.error(f"Failed to convert PDF to images: {pdf_path.name}") return None - total_pages = len(images) + total_pages = len(paths) markdown_parts = [] successful_pages = 0 - # Process each page - for page_num, image in enumerate(images, 1): - try: - # Convert image to base64 - image_base64 = image_to_base64(image) - - # Convert page with OpenAI - page_markdown = convert_page_with_openai( - image_base64, page_num, total_pages - ) - - if page_markdown: - markdown_parts.append(page_markdown) - markdown_parts.append("\n\n") - successful_pages += 1 - else: - logger.warning(f"Failed to convert page {page_num} with OpenAI") + try: + # Process each page: load one image at a time, convert, then 
move on + for page_num, image_path in enumerate(paths, 1): + try: + with Image.open(image_path) as img: + img.load() + image_base64 = image_to_base64(img) + # Convert page with OpenAI + page_markdown = convert_page_with_openai( + image_base64, page_num, total_pages + ) + + if page_markdown: + markdown_parts.append(page_markdown) + markdown_parts.append("\n\n") + successful_pages += 1 + else: + logger.warning(f"Failed to convert page {page_num} with OpenAI") + markdown_parts.append( + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" + ) + + except Exception as e: + logger.error( + f"Error processing page {page_num}: {str(e)}", + exc_info=True, + ) markdown_parts.append( f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) - - except Exception as e: - logger.error( - f"Error processing page {page_num}: {str(e)}", - exc_info=True, - ) - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - continue + continue + finally: + if tmp_dir is not None: + shutil.rmtree(tmp_dir, ignore_errors=True) markdown_content = "".join(markdown_parts) diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 966782e..d7f96be 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -199,7 +199,14 @@ def run_tracker_pipeline() -> int: # Group papers by ID so we can choose the preferred source format per paper. 
papers_by_id = {} for p in papers: - pid = (p["paper_id"] or "").strip().lower() + pid = (p.get("paper_id") or "").strip().lower() + if not pid: + logger.warning( + "Skipping paper entry without a paper_id in mailing %s: %r", + mailing_date, + p, + ) + continue if pid not in papers_by_id: papers_by_id[pid] = [] papers_by_id[pid].append(p) From c33c475dd94253a9c7c2692cc0db83685ecc11c4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 21:38:54 -0700 Subject: [PATCH 12/76] Fix: default sqlite, document #24 --- .github/workflows/actions.yml | 1 + config/test_settings.py | 10 +++-- requirements.txt | 1 - wg21_paper_tracker/admin.py | 4 +- wg21_paper_tracker/pipeline.py | 55 +++++++++++------------ wg21_paper_tracker/services.py | 2 + wg21_paper_tracker/tests/test_services.py | 7 +++ 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 11e32e5..f9ffb47 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -84,6 +84,7 @@ jobs: - name: Test with pytest env: DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres + TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres SECRET_KEY: for-testing-only DJANGO_SETTINGS_MODULE: config.test_settings run: | diff --git a/config/test_settings.py b/config/test_settings.py index 3c2e1db..2649ac5 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -7,10 +7,14 @@ from pathlib import Path from .settings import * # noqa: F401, F403 +from .settings import env -# Use SQLite in-memory for speed when DATABASE_URL not set (e.g. local pytest). -# CI can set DATABASE_URL=sqlite:///test.sqlite3 or leave unset for :memory: -if not os.environ.get("DATABASE_URL", "").strip(): +# Use SQLite in-memory for tests by default so no PostgreSQL is required. +# Set TEST_DATABASE_URL to run tests against PostgreSQL (e.g. in CI). 
+_test_db_url = os.environ.get("TEST_DATABASE_URL", "").strip() +if _test_db_url: + DATABASES = {"default": env.db("TEST_DATABASE_URL")} +else: DATABASES = { "default": { "ENGINE": "django.db.backends.sqlite3", diff --git a/requirements.txt b/requirements.txt index 289d486..a94ab8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,5 @@ selenium>=4.35 # wg21_paper_tracker app beautifulsoup4>=4.12.0 -lxml>=5.0.0 google-cloud-run>=0.10.1 google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py index 86784ae..bd57f4c 100644 --- a/wg21_paper_tracker/admin.py +++ b/wg21_paper_tracker/admin.py @@ -13,6 +13,7 @@ class WG21PaperAuthorInline(admin.TabularInline): model = WG21PaperAuthor extra = 1 raw_id_fields = ("profile",) + ordering = ("author_order", "id") @admin.register(WG21Paper) @@ -34,6 +35,7 @@ class WG21PaperAdmin(admin.ModelAdmin): @admin.register(WG21PaperAuthor) class WG21PaperAuthorAdmin(admin.ModelAdmin): - list_display = ("paper", "profile", "created_at") + list_display = ("paper", "profile", "author_order", "created_at") search_fields = ("paper__paper_id", "profile__display_name") raw_id_fields = ("paper", "profile") + ordering = ("paper", "author_order", "id") diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index d7f96be..9ea7550 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -243,6 +243,33 @@ def format_priority(ext: str) -> int: local_path = raw_dir / filename url = best_paper["url"] + # Persist paper row before transfer so failed downloads remain retry candidates + doc_date_str = best_paper.get("document_date") + from django.utils.dateparse import parse_date + + doc_date = None + if doc_date_str: + try: + doc_date = parse_date(doc_date_str) + except Exception as e: + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + doc_date = None + + paper_obj, _created = get_or_create_paper( + paper_id=pid, + 
url=url, + title=best_paper["title"], + document_date=doc_date, + mailing=mailing_obj, + subgroup=best_paper["subgroup"], + author_names=best_paper["authors"], + year=year, + ) + # Download if _download_file(url, local_path): uploaded = False @@ -257,34 +284,6 @@ def format_priority(ext: str) -> int: pid, ) - # Persist DB - doc_date_str = best_paper["document_date"] - # Parse date if available - from django.utils.dateparse import parse_date - - doc_date = None - if doc_date_str: - try: - doc_date = parse_date(doc_date_str) - except Exception as e: - logger.warning( - "Failed to parse document date: %s: %s", - doc_date_str, - e, - ) - doc_date = None - - paper_obj, _created = get_or_create_paper( - paper_id=pid, - url=url, - title=best_paper["title"], - document_date=doc_date, - mailing=mailing_obj, - subgroup=best_paper["subgroup"], - author_names=best_paper["authors"], - year=year, - ) - if uploaded: paper_obj.is_downloaded = True paper_obj.save(update_fields=["is_downloaded"]) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index f773b75..8030424 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -151,6 +151,8 @@ def get_or_create_paper_author( def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() + if year is None: + raise ValueError("year is required; pass 0 explicitly for placeholder papers") year_val = _normalize_year(year) WG21Paper.objects.filter( paper_id=paper_id, diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index 023f15c..bd3a3e2 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -218,6 +218,13 @@ def test_get_or_create_paper_sets_author_order(db): # --- mark_paper_downloaded --- +@pytest.mark.django_db +def test_mark_paper_downloaded_requires_year(db): + """mark_paper_downloaded raises ValueError when year is omitted.""" + with 
pytest.raises(ValueError, match="year is required"): + mark_paper_downloaded("p1000r0") + + @pytest.mark.django_db def test_mark_paper_downloaded_sets_flag(db): """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year).""" From 93ee8b7e3906ddf522134d074ded4caf6eaa3f31 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Mar 2026 08:18:51 -0700 Subject: [PATCH 13/76] Fix: author profile merge avoidance, blank paper_id rejection, mailing race recovery, logging and Ruf #24 --- cppa_user_tracker/services.py | 12 ++++------ cppa_user_tracker/tests/test_services.py | 23 +++++++++++++++---- .../converters/docling_converter.py | 6 ++--- .../converters/openai_converter.py | 19 ++++++++------- .../commands/import_wg21_metadata_from_csv.py | 9 ++++---- wg21_paper_tracker/pipeline.py | 2 +- wg21_paper_tracker/services.py | 2 ++ 7 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 152bbc2..34da007 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -395,11 +395,7 @@ def get_or_create_wg21_paper_author_profile( for p in candidates: if p.emails.filter(email=email_val).exists(): return p, False - profile = candidates[0] - if email_val and not profile.emails.filter(email=email_val).exists(): - add_email( - profile, - email_val, - is_primary=not profile.emails.filter(is_active=True).exists(), - ) - return profile, False + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) + add_email(profile, email_val, is_primary=True) + return profile, True + return candidates[0], False diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index 6d4d85b..75775ed 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -608,6 +608,19 @@ def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): assert profile.id == existing.id 
+@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_adds_email(): + """Existing single match gets the supplied email attached.""" + existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Solo Author", + email="solo@example.com", + ) + assert created is False + assert profile.id == existing.id + assert profile.emails.filter(email="solo@example.com").exists() + + @pytest.mark.django_db def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first(): """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email.""" @@ -635,8 +648,8 @@ def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_se @pytest.mark.django_db -def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_returns_first(): - """get_or_create_wg21_paper_author_profile returns first when email provided but no match.""" +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_creates_new_profile(): + """When multiple match and email matches none, a new profile is created with that email.""" first = WG21PaperAuthorProfile.objects.create(display_name="Other Name") second = WG21PaperAuthorProfile.objects.create(display_name="Other Name") services.add_email(second, "other@example.com", is_primary=True) @@ -644,5 +657,7 @@ def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_no display_name="Other Name", email="nomatch@example.com", ) - assert created is False - assert profile.id == first.id + assert created is True + assert profile.id not in (first.id, second.id) + assert profile.display_name == "Other Name" + assert profile.emails.filter(email="nomatch@example.com").exists() diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py 
b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py index b9d6067..7e73753 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -56,8 +56,6 @@ def convert_with_docling(pdf_path: Path) -> Optional[str]: ) return None - except Exception as e: - logger.error( - f"Docling conversion failed for {pdf_path.name}: {str(e)}", exc_info=True - ) + except Exception: + logger.error(f"Docling conversion failed for {pdf_path.name}", exc_info=True) return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 8952f15..7edd988 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -222,17 +222,16 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: try: logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - # Convert PDF to image files on disk (avoids loading all pages into memory) - tmp_dir, paths = pdf_to_images(pdf_path) - if not paths: - logger.error(f"Failed to convert PDF to images: {pdf_path.name}") - return None - - total_pages = len(paths) - markdown_parts = [] - successful_pages = 0 - try: + # Convert PDF to image files on disk (avoids loading all pages into memory) + tmp_dir, paths = pdf_to_images(pdf_path) + if not paths: + logger.error(f"Failed to convert PDF to images: {pdf_path.name}") + return None + + total_pages = len(paths) + markdown_parts = [] + successful_pages = 0 # Process each page: load one image at a time, convert, then move on for page_num, image_path in enumerate(paths, 1): try: diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 365a008..e00d64b 100644 --- 
a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -201,6 +201,8 @@ def handle(self, *args, **options): else: stats["papers_updated"] += 1 except IntegrityError as e: + # Re-resolve mailing (IntegrityError may have come from get_or_create_mailing race) + mailing, _ = get_or_create_mailing(mailing_date, mailing_title) # Duplicate (paper_id, year): fetch existing by same key and update try: lookup_year = year if year is not None else 0 @@ -230,12 +232,11 @@ def handle(self, *args, **options): name ) get_or_create_paper_author(paper, profile, i + 1) - except Exception as inner: + except Exception: stats["skipped"] += 1 - logger.error( - "Error for paper_id=%s (after IntegrityError): %s", + logger.exception( + "Error for paper_id=%s (after IntegrityError).", paper_id, - inner, ) except Exception as e: stats["skipped"] += 1 diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 9ea7550..894cc18 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -286,7 +286,7 @@ def format_priority(ext: str) -> int: if uploaded: paper_obj.is_downloaded = True - paper_obj.save(update_fields=["is_downloaded"]) + paper_obj.save(update_fields=["is_downloaded", "updated_at"]) total_new_papers += 1 # Clean up local file to save space diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 8030424..37bd91d 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -49,6 +49,8 @@ def get_or_create_paper( year: int | None = None, ) -> tuple[WG21Paper, bool]: paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") year_val = _normalize_year(year) def _update_paper(paper: WG21Paper) -> bool: From 61a6c7f1bcdd089c24632ce21b80652438abbe51 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Mar 2026 09:15:03 -0700 Subject: [PATCH 14/76] 
Fix: author profile merge avoidance, blank paper_id rejection, pipeline validation, mailing race recovery, logging, and API docs #24 --- docs/service_api/cppa_user_tracker.md | 2 +- wg21_paper_tracker/pipeline.py | 62 ++++++++++++++++++++++++--- wg21_paper_tracker/services.py | 2 + 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index bc89dbd..8f50642 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -45,7 +45,7 @@ | Function | Parameter types | Return type | Description | | -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, or multiple exist and one matches the email, returns that profile; otherwise returns the first. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). 
If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist and one matches the email, returns that profile. If multiple exist and no email is provided, returns the first. If multiple exist and the supplied email matches none, creates a new profile with that email. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. | --- diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 894cc18..516837e 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -228,8 +228,61 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue + # Filter to entries with required keys and valid types; skip malformed. + valid_list = [] + for p in p_list: + type_val = ( + (p.get("type") or "").strip() + if isinstance(p.get("type"), str) + else "" + ) + url_val = ( + (p.get("url") or "").strip() + if isinstance(p.get("url"), str) + else "" + ) + title_val = ( + (p.get("title") or "").strip() + if isinstance(p.get("title"), str) + else "" + ) + if not type_val or not url_val or not title_val: + logger.debug( + "Skipping malformed paper entry for %s in mailing %s: %r", + pid, + mailing_date, + p, + ) + continue + valid_list.append(p) + + if not valid_list: + logger.warning( + "Skipping paper %s in mailing %s: no valid entries (type, url, title)", + pid, + mailing_date, + ) + continue + # Pick the preferred format: adoc > html > ps > pdf. 
- best_paper = min(p_list, key=lambda x: format_priority(x["type"])) + best_paper = min( + valid_list, + key=lambda x: format_priority(str(x.get("type") or "").strip()), + ) + url = (best_paper.get("url") or "").strip() + title = (best_paper.get("title") or "").strip() + subgroup = (best_paper.get("subgroup") or "").strip() + authors = best_paper.get("authors") + if not isinstance(authors, list): + authors = [] + if not url or not title: + logger.warning( + "Skipping paper %s in mailing %s due to missing required fields: %r", + pid, + mailing_date, + best_paper, + ) + continue raw_filename = (best_paper.get("filename") or "").strip() filename = Path(raw_filename).name @@ -241,7 +294,6 @@ def format_priority(ext: str) -> int: ) continue local_path = raw_dir / filename - url = best_paper["url"] # Persist paper row before transfer so failed downloads remain retry candidates doc_date_str = best_paper.get("document_date") @@ -262,11 +314,11 @@ def format_priority(ext: str) -> int: paper_obj, _created = get_or_create_paper( paper_id=pid, url=url, - title=best_paper["title"], + title=title, document_date=doc_date, mailing=mailing_obj, - subgroup=best_paper["subgroup"], - author_names=best_paper["authors"], + subgroup=subgroup, + author_names=authors, year=year, ) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 37bd91d..b0be567 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -153,6 +153,8 @@ def get_or_create_paper_author( def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") if year is None: raise ValueError("year is required; pass 0 explicitly for placeholder papers") year_val = _normalize_year(year) From c9023f5e02ddc28faeb92a6221bb9fc755c6749d Mon Sep 17 00:00:00 2001 From: zho Date: Thu, 12 Mar 2026 00:40:51 +0800 Subject: [PATCH 15/76] #38-add youtube tracker. 
confirmed that transcripts are downloaded. --- .env.example | 13 + config/settings.py | 12 + ...5_youtubespeaker_alter_baseprofile_type.py | 48 ++ .../0006_alter_slackuser_slack_user_id.py | 18 + cppa_user_tracker/models.py | 13 + cppa_user_tracker/services.py | 20 + cppa_youtube_script_tracker/__init__.py | 1 + cppa_youtube_script_tracker/admin.py | 46 ++ cppa_youtube_script_tracker/apps.py | 7 + cppa_youtube_script_tracker/fetcher.py | 244 +++++++++ .../management/__init__.py | 1 + .../management/commands/__init__.py | 1 + .../run_cppa_youtube_script_tracker.py | 492 ++++++++++++++++++ .../migrations/0001_initial.py | 176 +++++++ .../migrations/__init__.py | 1 + cppa_youtube_script_tracker/models.py | 145 ++++++ cppa_youtube_script_tracker/preprocessor.py | 166 ++++++ cppa_youtube_script_tracker/services.py | 131 +++++ cppa_youtube_script_tracker/tests/__init__.py | 1 + cppa_youtube_script_tracker/transcript.py | 87 ++++ cppa_youtube_script_tracker/workspace.py | 85 +++ docs/Schema.md | 184 +++++-- docs/service_api/README.md | 2 + .../cppa_youtube_script_tracker.md | 144 +++++ requirements.txt | 4 + .../management/commands/run_all_collectors.py | 1 + 26 files changed, 2004 insertions(+), 39 deletions(-) create mode 100644 cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py create mode 100644 cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py create mode 100644 cppa_youtube_script_tracker/__init__.py create mode 100644 cppa_youtube_script_tracker/admin.py create mode 100644 cppa_youtube_script_tracker/apps.py create mode 100644 cppa_youtube_script_tracker/fetcher.py create mode 100644 cppa_youtube_script_tracker/management/__init__.py create mode 100644 cppa_youtube_script_tracker/management/commands/__init__.py create mode 100644 cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py create mode 100644 cppa_youtube_script_tracker/migrations/0001_initial.py create mode 100644 
cppa_youtube_script_tracker/migrations/__init__.py create mode 100644 cppa_youtube_script_tracker/models.py create mode 100644 cppa_youtube_script_tracker/preprocessor.py create mode 100644 cppa_youtube_script_tracker/services.py create mode 100644 cppa_youtube_script_tracker/tests/__init__.py create mode 100644 cppa_youtube_script_tracker/transcript.py create mode 100644 cppa_youtube_script_tracker/workspace.py create mode 100644 docs/service_api/cppa_youtube_script_tracker.md diff --git a/.env.example b/.env.example index 5bc2ec2..008958a 100644 --- a/.env.example +++ b/.env.example @@ -146,3 +146,16 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Path to context repository (where markdown files are exported) # DISCORD_CONTEXT_REPO_PATH=F:\boost\discord-cplusplus-together-context + +# ============================================================================= +# YouTube (cppa_youtube_script_tracker) +# ============================================================================= +# YouTube Data API v3 key (console.cloud.google.com → APIs & Services → Credentials) +# YOUTUBE_API_KEY=wZLObIrVHpcPZGj60P8GXA... + +# Pinecone namespace for YouTube video/transcript sync (default: youtube-scripts) +# YOUTUBE_PINECONE_NAMESPACE=youtube-scripts + +# Earliest published_at to use when DB is empty (ISO 8601, e.g. 
2015-01-01T00:00:00Z) +# YOUTUBE_DEFAULT_PUBLISHED_AFTER=2015-01-01T00:00:00Z + diff --git a/config/settings.py b/config/settings.py index d45b438..8c9942f 100644 --- a/config/settings.py +++ b/config/settings.py @@ -48,6 +48,7 @@ "cppa_slack_transcript_tracker", "cppa_slack_tracker", "discord_activity_tracker", + "cppa_youtube_script_tracker", ] MIDDLEWARE = [ @@ -140,6 +141,7 @@ "cppa_slack_tracker", "discord_activity_tracker", "boost_mailing_list_tracker", + "cppa_youtube_script_tracker", "shared", ) WORKSPACE_DIR.mkdir(parents=True, exist_ok=True) @@ -307,3 +309,13 @@ "level": "ERROR", } LOGGING["root"]["handlers"].append("slack") + +# YouTube (cppa_youtube_script_tracker) +YOUTUBE_API_KEY = (env("YOUTUBE_API_KEY", default="") or "").strip() +YOUTUBE_PINECONE_NAMESPACE = ( + env("YOUTUBE_PINECONE_NAMESPACE", default="youtube-scripts") or "youtube-scripts" +).strip() +YOUTUBE_DEFAULT_PUBLISHED_AFTER = ( + env("YOUTUBE_DEFAULT_PUBLISHED_AFTER", default="") or "" +).strip() + diff --git a/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py new file mode 100644 index 0000000..0286793 --- /dev/null +++ b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py @@ -0,0 +1,48 @@ +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="YoutubeSpeaker", + fields=[ + ( + "baseprofile_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="cppa_user_tracker.baseprofile", + ), + ), + ("display_name", models.CharField(db_index=True, max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", 
models.DateTimeField(auto_now=True)), + ], + bases=("cppa_user_tracker.baseprofile",), + ), + migrations.AlterField( + model_name="baseprofile", + name="type", + field=models.CharField( + choices=[ + ("github", "GitHub"), + ("slack", "Slack"), + ("mailing_list", "Mailing list"), + ("wg21", "WG21"), + ("discord", "Discord"), + ("youtube", "YouTube"), + ], + db_index=True, + max_length=20, + ), + ), + ] diff --git a/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py new file mode 100644 index 0000000..721c9a7 --- /dev/null +++ b/cppa_user_tracker/migrations/0006_alter_slackuser_slack_user_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.28 on 2026-03-11 01:57 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_youtubespeaker_alter_baseprofile_type"), + ] + + operations = [ + migrations.AlterField( + model_name="slackuser", + name="slack_user_id", + field=models.CharField(max_length=64, unique=True), + ), + ] diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 46be627..3e3da6e 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -11,6 +11,7 @@ class ProfileType(models.TextChoices): MAILING_LIST = "mailing_list", "Mailing list" WG21 = "wg21", "WG21" DISCORD = "discord", "Discord" + YOUTUBE = "youtube", "YouTube" class GitHubAccountType(models.TextChoices): @@ -183,3 +184,15 @@ def save(self, *args, **kwargs): is_bot = models.BooleanField(default=False) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) + + +class YoutubeSpeaker(BaseProfile): + """YouTube speaker profile. 
Identified by display_name.""" + + def save(self, *args, **kwargs): + self.type = ProfileType.YOUTUBE + super().save(*args, **kwargs) + + display_name = models.CharField(max_length=255, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index a583894..0e420e1 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -26,6 +26,7 @@ MailingListProfile, SlackUser, DiscordProfile, + YoutubeSpeaker, ) @@ -350,3 +351,22 @@ def get_or_create_discord_profile( profile.is_bot = is_bot profile.save() return profile, created + + +def get_or_create_youtube_speaker( + display_name: str, + identity: Optional[Identity] = None, +) -> tuple[YoutubeSpeaker, bool]: + """Get or create a YoutubeSpeaker by display_name. Returns (speaker, created). + + Looks up by display_name. On creation, sets identity if provided. + Raises ValueError if display_name is empty. 
+ """ + display_name_val = (display_name or "").strip() + if not display_name_val: + raise ValueError("display_name must not be empty.") + speaker, created = YoutubeSpeaker.objects.get_or_create( + display_name=display_name_val, + defaults={"identity": identity}, + ) + return speaker, created diff --git a/cppa_youtube_script_tracker/__init__.py b/cppa_youtube_script_tracker/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cppa_youtube_script_tracker/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/admin.py b/cppa_youtube_script_tracker/admin.py new file mode 100644 index 0000000..e96301e --- /dev/null +++ b/cppa_youtube_script_tracker/admin.py @@ -0,0 +1,46 @@ +from django.contrib import admin +from django.contrib.admin import ModelAdmin + +from .models import CppaTags, YouTubeChannel, YouTubeVideo, YouTubeVideoSpeaker, YouTubeVideoTags + + +@admin.register(YouTubeChannel) +class YouTubeChannelAdmin(ModelAdmin): + list_display = ("channel_id", "channel_title", "created_at") + search_fields = ("channel_id", "channel_title") + + +@admin.register(YouTubeVideo) +class YouTubeVideoAdmin(ModelAdmin): + list_display = ( + "video_id", + "title", + "channel", + "published_at", + "has_transcript", + "created_at", + ) + list_filter = ("has_transcript", "channel", "published_at") + search_fields = ("video_id", "title", "description", "search_term") + raw_id_fields = ("channel",) + date_hierarchy = "published_at" + + +@admin.register(YouTubeVideoSpeaker) +class YouTubeVideoSpeakerAdmin(ModelAdmin): + list_display = ("id", "video", "speaker", "created_at") + raw_id_fields = ("video", "speaker") + search_fields = ("video__video_id", "video__title", "speaker__display_name") + + +@admin.register(CppaTags) +class CppaTagsAdmin(ModelAdmin): + list_display = ("id", "tag_name") + search_fields = ("tag_name",) + + +@admin.register(YouTubeVideoTags) +class YouTubeVideoTagsAdmin(ModelAdmin): + list_display = ("id", "youtube_video", 
"cppa_tag") + raw_id_fields = ("youtube_video", "cppa_tag") + search_fields = ("youtube_video__video_id", "youtube_video__title", "cppa_tag__tag_name") diff --git a/cppa_youtube_script_tracker/apps.py b/cppa_youtube_script_tracker/apps.py new file mode 100644 index 0000000..6565dda --- /dev/null +++ b/cppa_youtube_script_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class CppaYoutubeScriptTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "cppa_youtube_script_tracker" + verbose_name = "CPPA YouTube Script Tracker" diff --git a/cppa_youtube_script_tracker/fetcher.py b/cppa_youtube_script_tracker/fetcher.py new file mode 100644 index 0000000..60e43c6 --- /dev/null +++ b/cppa_youtube_script_tracker/fetcher.py @@ -0,0 +1,244 @@ +""" +YouTube Data API v3 fetcher for cppa_youtube_script_tracker. + +Adapted from cppa-brain-backend/copilot_data/scrape/youtube_cpp/scraper.py. +Fetches video metadata for C++ channels between published_after and published_before. +""" + +from __future__ import annotations + +import logging +import re +import time +from datetime import datetime, timezone +from typing import Any, Optional + +from django.conf import settings + +logger = logging.getLogger(__name__) + +# Maps channel title to stable YouTube channel ID. +C_PLUS_PLUS_CHANNELS: dict[str, str] = { + "CppCon": "UCMlGfpWw-RUdWX_JbLCukXg", + "Meeting C++": "UCX9pk4YzHFcl3MsHIYBlEKg", + "C++Now": "UCEfngwe09zvd9KAL33YJSQQ", + "Jason Turner": "UCXTpTQHR7li1_HkUyAIUjkQ", + "TheCherno": "UCQ-W1KE9EYfdxhL6S4twUNw", + "Bo Qian": "UCEqgmyWChwmqyRdmnsS24Zw", +} + +_MAX_RESULTS_PER_PAGE = 50 +_DELAY_SECONDS = 0.5 + + +def _get_api_key() -> str: + """Return YOUTUBE_API_KEY from Django settings. Raises ValueError if missing.""" + key = (getattr(settings, "YOUTUBE_API_KEY", None) or "").strip() + if not key: + raise ValueError( + "YOUTUBE_API_KEY is not set. Add it to your .env or Django settings." 
+ ) + return key + + +def _parse_duration_iso(duration_iso: str) -> int: + """Parse ISO 8601 duration string (e.g. PT1H2M10S) to total seconds.""" + if not duration_iso or duration_iso == "PT": + return 0 + match = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?").match(duration_iso) + if not match: + return 0 + return ( + int(match.group(1) or 0) * 3600 + + int(match.group(2) or 0) * 60 + + int(match.group(3) or 0) + ) + + +def _format_video_data( + video_data: dict[str, Any], search_term: str = "" +) -> dict[str, Any]: + """Normalise a YouTube API video resource into a flat metadata dict.""" + snippet = video_data.get("snippet", {}) + statistics = video_data.get("statistics", {}) + content_details = video_data.get("contentDetails", {}) + duration_iso = content_details.get("duration", "PT0S") + view = statistics.get("viewCount") + like = statistics.get("likeCount") + comment = statistics.get("commentCount") + return { + "video_id": video_data.get("id", ""), + "title": snippet.get("title", ""), + "description": snippet.get("description", ""), + "channel_id": snippet.get("channelId", ""), + "channel_title": snippet.get("channelTitle", ""), + "published_at": snippet.get("publishedAt", ""), + "duration_seconds": _parse_duration_iso(duration_iso), + "view_count": int(view) if view is not None else None, + "like_count": int(like) if like is not None else None, + "comment_count": int(comment) if comment is not None else None, + "tags": snippet.get("tags") or [], + "search_term": search_term, + "scraped_at": datetime.now(tz=timezone.utc).isoformat(), + } + + +def _to_rfc3339(dt: datetime) -> str: + """Format a datetime as RFC 3339 (required by YouTube API publishedAfter/Before).""" + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _build_queries(channel_title: Optional[str]) -> list[tuple[str, Optional[str]]]: + """Return list of (query_text, channel_id_or_None) pairs to iterate over.""" + if channel_title: 
+ ch_id = C_PLUS_PLUS_CHANNELS.get(channel_title) + if not ch_id: + logger.warning( + "fetch_videos: channel_title %r not in C_PLUS_PLUS_CHANNELS; " + "falling back to keyword search", + channel_title, + ) + return [(channel_title, None)] + return [("C++", ch_id)] + return [("C++", ch_id) for ch_id in C_PLUS_PLUS_CHANNELS.values()] + + +def _fetch_search_page( + youtube: Any, + query_text: str, + ch_id: Optional[str], + after_str: str, + before_str: str, + page_token: Optional[str], +) -> Optional[dict[str, Any]]: + """Execute one search().list() call; return the response or None on error.""" + params: dict[str, Any] = { + "q": query_text, + "part": "id,snippet", + "type": "video", + "maxResults": _MAX_RESULTS_PER_PAGE, + "order": "date", + "publishedAfter": after_str, + "publishedBefore": before_str, + } + if ch_id: + params["channelId"] = ch_id + if page_token: + params["pageToken"] = page_token + try: + time.sleep(_DELAY_SECONDS) + return youtube.search().list(**params).execute() # type: ignore[union-attr] + except Exception as exc: # pylint: disable=broad-exception-caught + logger.error("fetch_videos: search API error: %s", exc) + return None + + +def _fetch_video_details(youtube: Any, video_ids: list[str]) -> list[dict[str, Any]]: + """Execute one videos().list() call; return items or empty list on error.""" + try: + time.sleep(_DELAY_SECONDS) + resp = ( + youtube.videos() # type: ignore[union-attr] + .list(part="snippet,statistics,contentDetails", id=",".join(video_ids)) + .execute() + ) + return resp.get("items", []) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.error("fetch_videos: videos.list API error: %s", exc) + return [] + + +def _process_one_channel_query( + youtube: Any, + query_text: str, + ch_id: Optional[str], + after_str: str, + before_str: str, + seen_ids: set[str], + min_duration_seconds: int, +) -> list[dict[str, Any]]: + """Paginate through search results for one (query, channel) pair. 
Returns new video dicts.""" + collected: list[dict[str, Any]] = [] + page_token: Optional[str] = None + while True: + response = _fetch_search_page( + youtube, query_text, ch_id, after_str, before_str, page_token + ) + if response is None: + break + + new_ids = [ + item["id"]["videoId"] + for item in response.get("items", []) + if item.get("id", {}).get("kind") == "youtube#video" + and item["id"]["videoId"] not in seen_ids + ] + + for vdata in _fetch_video_details(youtube, new_ids) if new_ids else []: + vid = vdata.get("id", "") + if not vid or vid in seen_ids: + continue + duration = _parse_duration_iso( + vdata.get("contentDetails", {}).get("duration", "PT0S") + ) + if min_duration_seconds and duration < min_duration_seconds: + continue + seen_ids.add(vid) + collected.append(_format_video_data(vdata, search_term=query_text)) + + page_token = response.get("nextPageToken") + if not page_token: + break + return collected + + +def fetch_videos( + published_after: datetime, + published_before: datetime, + channel_title: Optional[str] = None, + skip_video_ids: Optional[set[str]] = None, + min_duration_seconds: int = 0, +) -> list[dict[str, Any]]: + """Fetch video metadata from the YouTube Data API v3. + + Args: + published_after: Fetch videos published after this time. + published_before: Fetch videos published before this time. + channel_title: If given, restrict to that channel (key in C_PLUS_PLUS_CHANNELS + or fallback keyword search). + skip_video_ids: Video IDs already in DB (skipped). + min_duration_seconds: Skip videos shorter than this. + + Returns: + List of normalised video metadata dicts. 
+ """ + try: + from googleapiclient.discovery import build + except ImportError as exc: + raise ImportError( + "google-api-python-client is required: pip install google-api-python-client" + ) from exc + + youtube = build("youtube", "v3", developerKey=_get_api_key()) + after_str = _to_rfc3339(published_after) + before_str = _to_rfc3339(published_before) + seen_ids: set[str] = set(skip_video_ids or set()) + all_videos: list[dict[str, Any]] = [] + + for query_text, ch_id in _build_queries(channel_title): + all_videos.extend( + _process_one_channel_query( + youtube, + query_text, + ch_id, + after_str, + before_str, + seen_ids, + min_duration_seconds, + ) + ) + + logger.info("fetch_videos: fetched %d videos", len(all_videos)) + return all_videos diff --git a/cppa_youtube_script_tracker/management/__init__.py b/cppa_youtube_script_tracker/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cppa_youtube_script_tracker/management/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/management/commands/__init__.py b/cppa_youtube_script_tracker/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cppa_youtube_script_tracker/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py new file mode 100644 index 0000000..1ae922e --- /dev/null +++ b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py @@ -0,0 +1,492 @@ +""" +Management command: run_cppa_youtube_script_tracker + +4-phase pipeline: + Phase 1: Process existing metadata queue JSONs → persist to DB → + move JSON to raw/metadata/ (permanent archive). 
+ Phase 2: Determine start_time, fetch video metadata from YouTube Data API v3, + write to metadata queue (short-lived), persist to DB, + move JSON to raw/metadata/ (permanent archive). + Phase 3: Download VTT transcripts via yt-dlp for videos with has_transcript=False; + save directly to raw/transcripts/ (never deleted). + Phase 4: Pinecone upsert via run_cppa_pinecone_sync. +""" + +from __future__ import annotations + +import json +import logging +import os +import shutil +from datetime import datetime, timezone +from typing import Optional + +from django.conf import settings +from django.core.management import call_command +from django.core.management.base import BaseCommand +from django.utils.dateparse import parse_datetime + +from cppa_user_tracker.services import get_or_create_youtube_speaker +from cppa_youtube_script_tracker.fetcher import fetch_videos +from cppa_youtube_script_tracker.models import YouTubeVideo +from cppa_youtube_script_tracker.preprocessor import preprocess_youtube_for_pinecone +from cppa_youtube_script_tracker.services import ( + get_or_create_channel, + get_or_create_tag, + get_or_create_video, + link_speaker_to_video, + link_tag_to_video, + update_video_transcript, +) +from cppa_youtube_script_tracker.transcript import download_vtt +from cppa_youtube_script_tracker.workspace import ( + get_metadata_queue_path, + get_raw_metadata_path, + get_raw_transcripts_dir, + iter_metadata_queue_jsons, +) + +logger = logging.getLogger(__name__) + +PINECONE_NAMESPACE_ENV_KEY = "YOUTUBE_PINECONE_NAMESPACE" +_DEFAULT_PINECONE_NAMESPACE = "youtube-scripts" + +YOUTUBE_COOKIES_FILE = os.getenv("YOUTUBE_COOKIES_FILE", "youtube_cookies.txt") + + +def _clean_text(value: object) -> str: + """Return DB-safe text (PostgreSQL rejects NUL bytes).""" + if value is None: + return "" + value = str(value).replace("\x00", "").replace("\u2019", "'") + + return value + + +def _extract_speakers_from_title(title: str) -> list[str]: + """Heuristic: extract speaker names 
from talk titles like 'Topic - Speaker Name'. + + Returns a list of candidate names (may be empty if no pattern matched). + """ + if not title: + return [] + for sep in (" - ", " — ", " | "): + if sep in title: + candidate = title.split(sep)[-1].strip() + if candidate and len(candidate) < 80 and " " in candidate: + return [candidate] + return [] + + +def _move_to_raw(video_id: str, queue_path) -> None: + """Move a metadata JSON from queue to raw/metadata/ (permanent archive).""" + try: + raw_path = get_raw_metadata_path(video_id) + raw_path.parent.mkdir(parents=True, exist_ok=True) + shutil.move(str(queue_path), str(raw_path)) + except Exception: + logger.warning( + "_move_to_raw: could not move %s to raw/metadata/, removing instead", + queue_path, + ) + try: + queue_path.unlink(missing_ok=True) + except Exception: + pass + + +def _persist_video(video_data: dict) -> tuple[bool, bool]: + """Persist one video metadata dict to DB. Returns (created, skipped).""" + video_id = _clean_text(video_data.get("video_id", "")).strip() + if not video_id: + return False, True + + channel_id = _clean_text(video_data.get("channel_id", "")).strip() + channel_title = _clean_text(video_data.get("channel_title", "")).strip() + channel = get_or_create_channel(channel_id, channel_title) if channel_id else None + + metadata = { + "title": _clean_text(video_data.get("title", "")), + "description": _clean_text(video_data.get("description", "")), + "published_at": video_data.get("published_at"), + "duration_seconds": video_data.get("duration_seconds", 0), + "view_count": video_data.get("view_count"), + "like_count": video_data.get("like_count"), + "comment_count": video_data.get("comment_count"), + "search_term": _clean_text(video_data.get("search_term", "")), + "scraped_at": video_data.get("scraped_at"), + } + + try: + video, created = get_or_create_video( + video_id=video_id, channel=channel, metadata_dict=metadata + ) + except Exception: + logger.exception("_persist_video: failed to 
def _get_start_time_from_db() -> Optional[datetime]:
    """Return the latest published_at from YouTubeVideo, or None if there is none.

    Rows whose ``published_at`` is NULL are excluded before ordering.
    PostgreSQL sorts NULLs first in a descending order, so without the filter
    a single undated video would hide every dated one and this function would
    report that no start time is available.
    """
    return (
        YouTubeVideo.objects.filter(published_at__isnull=False)
        .order_by("-published_at")
        .values_list("published_at", flat=True)
        .first()
    )
bool) -> datetime: + """Resolve the start_time for Phase 2 fetch. + + Priority: CLI arg → latest DB record → YOUTUBE_DEFAULT_PUBLISHED_AFTER → 2015-01-01. + """ + if start_time_arg: + dt = parse_datetime(start_time_arg) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + if not dry_run: + db_dt = _get_start_time_from_db() + if db_dt: + logger.info( + "run_cppa_youtube_script_tracker: using start_time from DB: %s", db_dt + ) + return db_dt + + default_after = ( + getattr(settings, "YOUTUBE_DEFAULT_PUBLISHED_AFTER", None) or "" + ).strip() + if default_after: + dt = parse_datetime(default_after) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + fallback = datetime(2015, 1, 1, tzinfo=timezone.utc) + logger.warning( + "run_cppa_youtube_script_tracker: no start_time available; defaulting to %s", + fallback, + ) + return fallback + + +def _resolve_end_time(end_time_arg: str) -> datetime: + """Parse end_time CLI arg or default to now().""" + if end_time_arg: + dt = parse_datetime(end_time_arg) + if dt: + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + return datetime.now(tz=timezone.utc) + + +def _persist_fetched_video(vdata: dict) -> tuple[bool, bool]: + """Write video to metadata queue/, persist to DB, move to raw/metadata/. 
def _run_phase_2(
    start_time: datetime,
    end_time: datetime,
    channel_title: str,
) -> tuple[int, int]:
    """Fetch new videos and persist them. Returns (created_count, skipped_count).

    IDs already stored in the database are handed to the fetcher so they are
    not fetched again. A video counts as skipped only when it was not created.
    """
    known_ids: set[str] = set(YouTubeVideo.objects.values_list("video_id", flat=True))
    fetched = fetch_videos(
        published_after=start_time,
        published_before=end_time,
        channel_title=channel_title or None,
        skip_video_ids=known_ids,
    )
    # Persist in fetch order; each outcome is a (created, skipped) pair.
    outcomes = [_persist_fetched_video(item) for item in fetched]
    created_total = sum(1 for created, _ in outcomes if created)
    skipped_total = sum(1 for created, skipped in outcomes if skipped and not created)
    return created_total, skipped_total
def _run_pinecone_sync(app_id: str, namespace: str) -> None:
    """Trigger run_cppa_pinecone_sync if app_id and namespace are set.

    A missing app_id or namespace only produces a warning. Any exception
    raised by the sync command is logged as a warning and not re-raised.
    """
    if not app_id:
        logger.warning("Pinecone sync skipped: --pinecone-app-id is empty.")
    elif not namespace:
        logger.warning(
            "Pinecone sync skipped: namespace is empty (set --pinecone-namespace or %s).",
            PINECONE_NAMESPACE_ENV_KEY,
        )
    else:
        try:
            call_command(
                "run_cppa_pinecone_sync",
                app_id=app_id,
                namespace=namespace,
                preprocess_fn=preprocess_youtube_for_pinecone,
            )
            logger.info(
                "run_cppa_youtube_script_tracker: Pinecone sync complete (app_id=%s, namespace=%s)",
                app_id,
                namespace,
            )
        except Exception as exc:  # pylint: disable=broad-exception-caught
            logger.warning(
                "Pinecone sync skipped/failed (run_cppa_pinecone_sync unavailable or errored): %s",
                exc,
            )
" + "Default: latest published_at in DB (after Phase 1), " + "or YOUTUBE_DEFAULT_PUBLISHED_AFTER env var if DB is empty." + ), + ) + parser.add_argument( + "--end-time", + type=str, + default="", + help="ISO datetime string; fetch videos published before this time. Default: now().", + ) + parser.add_argument( + "--channel-title", + type=str, + default="", + help=( + "Restrict scraping to a specific channel title " + "(must match a key in fetcher.C_PLUS_PLUS_CHANNELS or search by name)." + ), + ) + parser.add_argument( + "--dry-run", action="store_true", help="Skip DB writes and API calls." + ) + parser.add_argument( + "--skip-transcript", action="store_true", help="Skip Phase 3." + ) + parser.add_argument( + "--pinecone-app-id", + type=str, + default="youtube", + help="App ID passed to run_cppa_pinecone_sync.", + ) + parser.add_argument( + "--pinecone-namespace", + type=str, + default=os.getenv(PINECONE_NAMESPACE_ENV_KEY, _DEFAULT_PINECONE_NAMESPACE), + help=f"Pinecone namespace. Default from env {PINECONE_NAMESPACE_ENV_KEY}.", + ) + + def handle(self, *args, **options): + start_time_arg = (options.get("start_time") or "").strip() + end_time_arg = (options.get("end_time") or "").strip() + channel_title = (options.get("channel_title") or "").strip() + dry_run: bool = options["dry_run"] + skip_transcript: bool = options["skip_transcript"] + pinecone_app_id = (options.get("pinecone_app_id") or "").strip() + pinecone_namespace = (options.get("pinecone_namespace") or "").strip() + + logger.info( + "run_cppa_youtube_script_tracker: starting " + "(start_time=%s, end_time=%s, channel_title=%s, dry_run=%s, skip_transcript=%s)", + start_time_arg or "auto", + end_time_arg or "now", + channel_title or "all", + dry_run, + skip_transcript, + ) + + try: + self._phase_1(dry_run) + start_time = _resolve_start_time(start_time_arg, dry_run) + end_time = _resolve_end_time(end_time_arg) + + self.stdout.write( + f"Phase 2: fetching videos {start_time.isoformat()} → 
{end_time.isoformat()} …" + ) + + if dry_run: + self.stdout.write( + self.style.SUCCESS( + f"Dry run: would fetch from {start_time.isoformat()} to " + f"{end_time.isoformat()}. No API calls or DB writes." + ) + ) + return + + self._phase_2(start_time, end_time, channel_title) + self._phase_3(skip_transcript) + _run_pinecone_sync(app_id=pinecone_app_id, namespace=pinecone_namespace) + + except Exception: + logger.exception("run_cppa_youtube_script_tracker: unhandled error") + raise + + def _phase_1(self, dry_run: bool) -> None: + if dry_run: + return + files_processed, videos_skipped = _process_queue() + self.stdout.write( + f"Phase 1: processed {files_processed} queue file(s); {videos_skipped} video(s) skipped." + ) + logger.info( + "run_cppa_youtube_script_tracker: Phase 1 done; queue_files=%d, skipped=%d", + files_processed, + videos_skipped, + ) + + def _phase_2( + self, start_time: datetime, end_time: datetime, channel_title: str + ) -> None: + created_count, skipped_count = _run_phase_2(start_time, end_time, channel_title) + if created_count == 0 and skipped_count == 0: + self.stdout.write(self.style.WARNING("Phase 2: no new videos fetched.")) + logger.info("run_cppa_youtube_script_tracker: Phase 2 — no new videos") + else: + self.stdout.write( + self.style.SUCCESS( + f"Phase 2 done: {created_count} created, {skipped_count} skipped." 
+ ) + ) + logger.info( + "run_cppa_youtube_script_tracker: Phase 2 done; created=%d, skipped=%d", + created_count, + skipped_count, + ) + + def _phase_3(self, skip_transcript: bool) -> None: + if skip_transcript: + self.stdout.write("Phase 3: skipped (--skip-transcript).") + return + ok, fail = _run_phase_3() + self.stdout.write(f"Phase 3 done: {ok} downloaded, {fail} unavailable.") + logger.info( + "run_cppa_youtube_script_tracker: Phase 3 done; ok=%d, fail=%d", ok, fail + ) diff --git a/cppa_youtube_script_tracker/migrations/0001_initial.py b/cppa_youtube_script_tracker/migrations/0001_initial.py new file mode 100644 index 0000000..843b0b3 --- /dev/null +++ b/cppa_youtube_script_tracker/migrations/0001_initial.py @@ -0,0 +1,176 @@ +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("cppa_user_tracker", "0005_youtubespeaker_alter_baseprofile_type"), + ] + + operations = [ + migrations.CreateModel( + name="YouTubeChannel", + fields=[ + ( + "channel_id", + models.CharField(max_length=64, primary_key=True, serialize=False), + ), + ("channel_title", models.CharField(blank=True, max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "verbose_name": "YouTube channel", + "verbose_name_plural": "YouTube channels", + "ordering": ["channel_title"], + }, + ), + migrations.CreateModel( + name="CppaTags", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("tag_name", models.CharField(db_index=True, max_length=128, unique=True)), + ], + options={ + "verbose_name": "CPPA tag", + "verbose_name_plural": "CPPA tags", + "ordering": ["tag_name"], + }, + ), + migrations.CreateModel( + name="YouTubeVideo", + fields=[ + ( + "video_id", + models.CharField(max_length=32, primary_key=True, 
serialize=False), + ), + ( + "channel", + models.ForeignKey( + blank=True, + db_column="channel_id", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="videos", + to="cppa_youtube_script_tracker.youtubechannel", + ), + ), + ("title", models.CharField(blank=True, max_length=512)), + ("description", models.TextField(blank=True)), + ("published_at", models.DateTimeField(blank=True, db_index=True, null=True)), + ("duration_seconds", models.IntegerField(default=0)), + ("view_count", models.IntegerField(blank=True, null=True)), + ("like_count", models.IntegerField(blank=True, null=True)), + ("comment_count", models.IntegerField(blank=True, null=True)), + ("search_term", models.CharField(blank=True, max_length=255)), + ("has_transcript", models.BooleanField(default=False)), + ("transcript_path", models.CharField(blank=True, max_length=1024)), + ("scraped_at", models.DateTimeField(blank=True, null=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "verbose_name": "YouTube video", + "verbose_name_plural": "YouTube videos", + "ordering": ["-published_at"], + }, + ), + migrations.CreateModel( + name="YouTubeVideoSpeaker", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "video", + models.ForeignKey( + db_column="video_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_speakers", + to="cppa_youtube_script_tracker.youtubevideo", + ), + ), + ( + "speaker", + models.ForeignKey( + db_column="speaker_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_appearances", + to="cppa_user_tracker.youtubespeaker", + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ], + options={ + "verbose_name": "YouTube video speaker", + "verbose_name_plural": "YouTube video speakers", + "ordering": ["video", "speaker"], + }, + ), + 
migrations.AddConstraint( + model_name="youtubevideospeaker", + constraint=models.UniqueConstraint( + fields=["video", "speaker"], name="unique_video_speaker" + ), + ), + migrations.CreateModel( + name="YouTubeVideoTags", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "youtube_video", + models.ForeignKey( + db_column="youtube_video_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="video_tags", + to="cppa_youtube_script_tracker.youtubevideo", + ), + ), + ( + "cppa_tag", + models.ForeignKey( + db_column="cppa_tag_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="tagged_videos", + to="cppa_youtube_script_tracker.cppatags", + ), + ), + ], + options={ + "verbose_name": "YouTube video tag", + "verbose_name_plural": "YouTube video tags", + "ordering": ["youtube_video", "cppa_tag"], + }, + ), + migrations.AddConstraint( + model_name="youtubevideotags", + constraint=models.UniqueConstraint( + fields=["youtube_video", "cppa_tag"], name="unique_video_tag" + ), + ), + ] diff --git a/cppa_youtube_script_tracker/migrations/__init__.py b/cppa_youtube_script_tracker/migrations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cppa_youtube_script_tracker/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/models.py b/cppa_youtube_script_tracker/models.py new file mode 100644 index 0000000..d852ad9 --- /dev/null +++ b/cppa_youtube_script_tracker/models.py @@ -0,0 +1,145 @@ +""" +Models per docs/Schema.md — cppa_youtube_script_tracker section. + +Tables: +- YouTubeChannel: publisher channel (e.g. 
CppCon, C++Now); channel_id is PK +- YouTubeVideo: individual video metadata + transcript state; video_id is PK +- YouTubeVideoSpeaker: M2M join between YouTubeVideo and cppa_user_tracker.YoutubeSpeaker +- CppaTags: C++ community tag vocabulary +- YouTubeVideoTags: M2M join between YouTubeVideo and CppaTags +""" + +from django.db import models + + +class YouTubeChannel(models.Model): + """Publishing channel a video was uploaded to (e.g. CppCon, C++Now). + + channel_id is the YouTube channel ID and serves as the primary key. + """ + + channel_id = models.CharField(max_length=64, primary_key=True) + channel_title = models.CharField(max_length=255, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["channel_title"] + verbose_name = "YouTube channel" + verbose_name_plural = "YouTube channels" + + def __str__(self) -> str: + return self.channel_title or self.channel_id + + +class YouTubeVideo(models.Model): + """YouTube video metadata and transcript download state. + + video_id is the YouTube video ID and serves as the primary key. 
+ """ + + video_id = models.CharField(max_length=32, primary_key=True) + channel = models.ForeignKey( + YouTubeChannel, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="videos", + db_column="channel_id", + ) + title = models.CharField(max_length=512, blank=True) + description = models.TextField(blank=True) + published_at = models.DateTimeField(null=True, blank=True, db_index=True) + duration_seconds = models.IntegerField(default=0) + view_count = models.IntegerField(null=True, blank=True) + like_count = models.IntegerField(null=True, blank=True) + comment_count = models.IntegerField(null=True, blank=True) + search_term = models.CharField(max_length=255, blank=True) + has_transcript = models.BooleanField(default=False) + transcript_path = models.CharField(max_length=1024, blank=True) + scraped_at = models.DateTimeField(null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-published_at"] + verbose_name = "YouTube video" + verbose_name_plural = "YouTube videos" + + def __str__(self) -> str: + return self.title or self.video_id + + +class YouTubeVideoSpeaker(models.Model): + """M2M join: links a YouTubeVideo to a YoutubeSpeaker profile.""" + + video = models.ForeignKey( + YouTubeVideo, + on_delete=models.CASCADE, + related_name="video_speakers", + db_column="video_id", + ) + speaker = models.ForeignKey( + "cppa_user_tracker.YoutubeSpeaker", + on_delete=models.CASCADE, + related_name="video_appearances", + db_column="speaker_id", + ) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["video", "speaker"], name="unique_video_speaker" + ) + ] + ordering = ["video", "speaker"] + verbose_name = "YouTube video speaker" + verbose_name_plural = "YouTube video speakers" + + def __str__(self) -> str: + return f"video={self.video_id} speaker={self.speaker_id}" # type: 
ignore[attr-defined] + + +class CppaTags(models.Model): + """C++ community tag vocabulary (e.g. 'concurrency', 'templates', 'modules').""" + + tag_name = models.CharField(max_length=128, unique=True, db_index=True) + + class Meta: + ordering = ["tag_name"] + verbose_name = "CPPA tag" + verbose_name_plural = "CPPA tags" + + def __str__(self) -> str: + return self.tag_name + + +class YouTubeVideoTags(models.Model): + """M2M join: links a YouTubeVideo to a CppaTags entry.""" + + youtube_video = models.ForeignKey( + YouTubeVideo, + on_delete=models.CASCADE, + related_name="video_tags", + db_column="youtube_video_id", + ) + cppa_tag = models.ForeignKey( + CppaTags, + on_delete=models.CASCADE, + related_name="tagged_videos", + db_column="cppa_tag_id", + ) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=["youtube_video", "cppa_tag"], name="unique_video_tag" + ) + ] + ordering = ["youtube_video", "cppa_tag"] + verbose_name = "YouTube video tag" + verbose_name_plural = "YouTube video tags" + + def __str__(self) -> str: + return f"video={self.youtube_video_id} tag={self.cppa_tag_id}" # type: ignore[attr-defined] diff --git a/cppa_youtube_script_tracker/preprocessor.py b/cppa_youtube_script_tracker/preprocessor.py new file mode 100644 index 0000000..41983aa --- /dev/null +++ b/cppa_youtube_script_tracker/preprocessor.py @@ -0,0 +1,166 @@ +""" +Pinecone preprocess function for cppa_youtube_script_tracker. + +Guideline source: docs/Pinecone_preprocess_guideline_c.md + +Returns whole-document payloads (is_chunked=False) so the sync pipeline can +apply its configured chunking strategy. 
+""" + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any + +from django.db.models import Q + +from .models import YouTubeVideo + + +def _normalize_failed_ids(failed_ids: list[str]) -> list[str]: + """Return stripped, non-empty, de-duplicated failed IDs preserving order.""" + seen: set[str] = set() + out: list[str] = [] + for raw in failed_ids: + value = (raw or "").strip() + if not value or value in seen: + continue + seen.add(value) + out.append(value) + return out + + +def _read_vtt(transcript_path: str) -> str: + """Return plain text from a .vtt file, stripping VTT header/timestamps. + + Returns empty string if the file does not exist or cannot be read. + """ + path = Path(transcript_path) + if not path.exists(): + return "" + try: + raw = path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "" + + lines: list[str] = [] + for line in raw.splitlines(): + line = line.strip() + # Skip WEBVTT header, NOTE blocks, blank lines, and timestamp lines + if not line: + continue + if line.startswith("WEBVTT") or line.startswith("NOTE"): + continue + # Timestamp lines: "00:00:00.000 --> 00:00:05.000" or similar + if "-->" in line: + continue + # Cue-setting lines (e.g. 
"align:start position:0%") + if line.startswith("align:") or line.startswith("position:"): + continue + lines.append(line) + + return " ".join(lines).strip() + + +def _get_speaker_names(video: YouTubeVideo) -> list[str]: + """Return a sorted list of speaker display_names linked to this video.""" + names = list( + video.video_speakers.select_related("speaker") + .values_list("speaker__display_name", flat=True) + .order_by("speaker__display_name") + ) + return [n for n in names if n] + + +def _build_document_content(video: YouTubeVideo, speaker_names: list[str]) -> str: + """Build plain-text content for embedding.""" + parts: list[str] = [] + + if video.title: + parts.append(f"Title: {video.title.strip()}") + if speaker_names: + parts.append(f"Speakers: {', '.join(speaker_names)}") + if video.channel and video.channel.channel_title: + parts.append(f"Channel: {video.channel.channel_title.strip()}") + if video.published_at: + parts.append(f"Published: {video.published_at.isoformat()}") + + description = (video.description or "").strip() + if description: + parts.append(f"\nDescription:\n{description}") + + if video.has_transcript and video.transcript_path: + transcript_text = _read_vtt(video.transcript_path) + if transcript_text: + parts.append(f"\nTranscript:\n{transcript_text}") + + return "\n".join(parts).strip() + + +def _build_candidate_queryset(normalized_failed: list[str], final_sync_at: datetime | None): + """Return the ORM queryset of candidates to preprocess.""" + queryset = YouTubeVideo._default_manager.select_related("channel").prefetch_related( # type: ignore[attr-defined] + "video_speakers__speaker" + ) + if final_sync_at is None and not normalized_failed: + return queryset.order_by("id") + criteria = Q() + if final_sync_at is not None: + criteria |= Q(created_at__gt=final_sync_at) + if normalized_failed: + criteria |= Q(video_id__in=normalized_failed) + return queryset.filter(criteria).order_by("id") + + +def _build_video_metadata(video: YouTubeVideo, 
speaker_names: list[str]) -> dict[str, Any]: + """Build the Pinecone metadata dict for one video.""" + channel_title = (video.channel.channel_title if video.channel else "") or "" + return { + "doc_id": f"youtube-{video.video_id}", + "ids": str(video.pk), + "type": "youtube", + "url": f"https://www.youtube.com/watch?v={video.video_id}", + "title": video.title or "", + "author": ", ".join(speaker_names), + "channel": channel_title, + "timestamp": int(video.published_at.timestamp()) if video.published_at else 0, + "has_transcript": video.has_transcript, + } + + +def preprocess_youtube_for_pinecone( + failed_ids: list[str], + final_sync_at: datetime | None, +) -> tuple[list[dict[str, Any]], bool]: + """Build Pinecone sync documents for YouTube videos. + + Args: + failed_ids: Previous-run failed source IDs (video_id values). + final_sync_at: Last sync timestamp for incremental sync; None means first sync. + + Returns: + (documents, is_chunked) + - documents: list[{"content": str, "metadata": dict}] + - is_chunked: False (whole docs; pipeline may chunk later) + """ + normalized_failed = _normalize_failed_ids(failed_ids or []) + candidates = _build_candidate_queryset(normalized_failed, final_sync_at) + + docs: list[dict[str, Any]] = [] + seen_video_ids: set[str] = set() + + for video in candidates: + vid = (video.video_id or "").strip() + if not vid or vid in seen_video_ids: + continue + seen_video_ids.add(vid) + + speaker_names = _get_speaker_names(video) + content = _build_document_content(video, speaker_names) + if not content: + continue + + docs.append({"content": content, "metadata": _build_video_metadata(video, speaker_names)}) + + return docs, False diff --git a/cppa_youtube_script_tracker/services.py b/cppa_youtube_script_tracker/services.py new file mode 100644 index 0000000..0cacba1 --- /dev/null +++ b/cppa_youtube_script_tracker/services.py @@ -0,0 +1,131 @@ +""" +Service layer for cppa_youtube_script_tracker. 
+ +All creates/updates/deletes for this app's models must go through functions in this +module. Do not call Model.objects.create(), model.save(), or model.delete() from +outside this module. + +See docs/Contributing.md for the project-wide rule. +""" + +from __future__ import annotations + +from typing import Any, Optional + +from .models import CppaTags, YouTubeChannel, YouTubeVideo, YouTubeVideoSpeaker, YouTubeVideoTags + + +def _parse_dt_field(value: Any) -> Any: + """Parse a datetime string field; returns datetime, None, or the original value.""" + if isinstance(value, str) and value: + from django.utils.dateparse import parse_datetime as _pd + return _pd(value) + return value + + +def get_or_create_channel( + channel_id: str, + channel_title: str = "", +) -> YouTubeChannel: + """Get or create a YouTubeChannel by channel_id (PK). + + If the channel exists and channel_title differs, the title is updated. + Returns the YouTubeChannel instance. + """ + channel_id_val = (channel_id or "").strip() + channel_title_val = (channel_title or "").strip() + channel, created = YouTubeChannel.objects.get_or_create( + channel_id=channel_id_val, + defaults={"channel_title": channel_title_val}, + ) + if not created and channel_title_val and channel.channel_title != channel_title_val: + channel.channel_title = channel_title_val + channel.save(update_fields=["channel_title", "updated_at"]) + return channel + + +def get_or_create_video( + video_id: str, + channel: Optional[YouTubeChannel], + metadata_dict: dict[str, Any], +) -> tuple[YouTubeVideo, bool]: + """Get or create a YouTubeVideo by video_id (PK). Returns (video, created). + + metadata_dict keys (all optional): + title, description, published_at (datetime or ISO str), duration_seconds, + view_count, like_count, comment_count, search_term, scraped_at. + + Raises ValueError if video_id is empty. 
+ """ + video_id_val = (video_id or "").strip() + if not video_id_val: + raise ValueError("video_id must not be empty.") + + published_at = _parse_dt_field(metadata_dict.get("published_at")) + scraped_at = _parse_dt_field(metadata_dict.get("scraped_at")) + + defaults: dict[str, Any] = { + "channel": channel, + "title": (metadata_dict.get("title") or ""), + "description": (metadata_dict.get("description") or ""), + "published_at": published_at, + "duration_seconds": int(metadata_dict.get("duration_seconds") or 0), + "view_count": metadata_dict.get("view_count"), + "like_count": metadata_dict.get("like_count"), + "comment_count": metadata_dict.get("comment_count"), + "search_term": (metadata_dict.get("search_term") or ""), + "scraped_at": scraped_at, + } + video, created = YouTubeVideo.objects.get_or_create( + video_id=video_id_val, + defaults=defaults, + ) + return video, created + + +def update_video_transcript( + video: YouTubeVideo, + transcript_path: str, +) -> YouTubeVideo: + """Mark video as having a transcript and store its path. Returns the updated video.""" + video.has_transcript = True + video.transcript_path = (transcript_path or "").strip() + video.save(update_fields=["has_transcript", "transcript_path", "updated_at"]) + return video + + +def link_speaker_to_video( + video: YouTubeVideo, + speaker: Any, +) -> YouTubeVideoSpeaker: + """Link a YoutubeSpeaker to a YouTubeVideo (get-or-create). Returns YouTubeVideoSpeaker.""" + join, _ = YouTubeVideoSpeaker.objects.get_or_create( + video=video, + speaker=speaker, + ) + return join + + +def get_or_create_tag(tag_name: str) -> CppaTags: + """Get or create a CppaTags entry by tag_name. + + Raises ValueError if tag_name is empty. + Returns the CppaTags instance. 
+ """ + tag_name_val = (tag_name or "").strip().lower() + if not tag_name_val: + raise ValueError("tag_name must not be empty.") + tag, _ = CppaTags.objects.get_or_create(tag_name=tag_name_val) + return tag + + +def link_tag_to_video( + video: YouTubeVideo, + tag: CppaTags, +) -> YouTubeVideoTags: + """Link a CppaTags entry to a YouTubeVideo (get-or-create). Returns YouTubeVideoTags.""" + join, _ = YouTubeVideoTags.objects.get_or_create( + youtube_video=video, + cppa_tag=tag, + ) + return join diff --git a/cppa_youtube_script_tracker/tests/__init__.py b/cppa_youtube_script_tracker/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cppa_youtube_script_tracker/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/cppa_youtube_script_tracker/transcript.py b/cppa_youtube_script_tracker/transcript.py new file mode 100644 index 0000000..f1230db --- /dev/null +++ b/cppa_youtube_script_tracker/transcript.py @@ -0,0 +1,87 @@ +""" +VTT transcript downloader for cppa_youtube_script_tracker. + +Adapted from cppa-brain-backend/copilot_data/scrape/youtube_cpp/scraper.py +(YouTubeCppScraper._content_download / _setup_ytdlp). +Uses yt-dlp to download auto-generated or manual English subtitles as .vtt files. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +_YDL_OPTS_BASE: dict = { + "skip_download": True, + "force_ipv4": True, + "writesubtitles": True, + "writeautomaticsub": True, + "subtitleslangs": ["en"], + "subtitlesformat": "vtt", + "quiet": True, + "no_warnings": True, + "ignore_no_formats_error": True, + "extractor_args": { + "youtube": ["player_client=tv,web_safari"], + }, +} + + +def download_vtt( + video_id: str, + output_dir: Path, + cookies_file: Optional[str] = None, +) -> Optional[Path]: + """Download the English VTT transcript for video_id into output_dir. + + Tries manual captions first, then auto-generated. 
Returns the Path to the + downloaded .vtt file on success, or None if no transcript was found. + + Args: + video_id: YouTube video ID (11 characters). + output_dir: Directory where the .vtt file will be written. + cookies_file: Optional path to a cookies.txt for authenticated requests. + + Returns: + Path to the downloaded file (e.g. output_dir/{video_id}.en.vtt), or None. + """ + try: + import yt_dlp + except ImportError as exc: + raise ImportError("yt-dlp is required: pip install yt-dlp") from exc + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + url = f"https://www.youtube.com/watch?v={video_id}" + outtmpl = str(output_dir / "%(id)s.%(ext)s") + + ydl_opts = dict(_YDL_OPTS_BASE) + ydl_opts["outtmpl"] = outtmpl + if cookies_file: + ydl_opts["cookiefile"] = cookies_file + + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.error("download_vtt: yt-dlp error for %s: %s", video_id, exc) + return None + + # yt-dlp writes {video_id}.{lang}.vtt; try most common pattern first + expected = output_dir / f"{video_id}.en.vtt" + if expected.exists(): + logger.debug("download_vtt: found %s", expected) + return expected + + # Fallback: look for any .vtt file matching the video_id + matches = list(output_dir.glob(f"{video_id}*.vtt")) + if matches: + logger.debug("download_vtt: found %s (fallback glob)", matches[0]) + return matches[0] + + logger.info("download_vtt: no VTT transcript found for %s", video_id) + return None diff --git a/cppa_youtube_script_tracker/workspace.py b/cppa_youtube_script_tracker/workspace.py new file mode 100644 index 0000000..19345ba --- /dev/null +++ b/cppa_youtube_script_tracker/workspace.py @@ -0,0 +1,85 @@ +""" +Workspace paths for cppa_youtube_script_tracker. 
+ +Layout: +- Metadata queue: workspace/cppa_youtube_script_tracker/metadata/{video_id}.json + (short-lived; moved to raw after DB persist) +- Raw metadata: workspace/raw/cppa_youtube_script_tracker/metadata/{video_id}.json + (permanent archive; never deleted) +- Raw transcripts: workspace/raw/cppa_youtube_script_tracker/transcripts/{video_id}.en.vtt + (permanent archive; never deleted) +""" + +from pathlib import Path + +from config.workspace import get_workspace_path + +_APP_SLUG = "cppa_youtube_script_tracker" +_RAW_APP_SLUG = f"raw/{_APP_SLUG}" + + +def get_workspace_root() -> Path: + """Return this app's workspace directory (workspace/cppa_youtube_script_tracker/).""" + return get_workspace_path(_APP_SLUG) + + +def get_raw_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/; creates if missing.""" + path = get_workspace_path(_RAW_APP_SLUG) + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_metadata_queue_dir() -> Path: + """Return workspace/cppa_youtube_script_tracker/metadata/; creates if missing. + + JSON files here are short-lived: moved to raw/metadata/ after DB persist. + """ + path = get_workspace_root() / "metadata" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_raw_metadata_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/metadata/; creates if missing. + + Permanent archive: JSON files are never deleted after being moved here. + """ + path = get_raw_dir() / "metadata" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_raw_transcripts_dir() -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/transcripts/; creates if missing. + + Permanent archive: VTT files are never deleted. 
+ """ + path = get_raw_dir() / "transcripts" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_metadata_queue_path(video_id: str) -> Path: + """Return workspace/cppa_youtube_script_tracker/metadata/{video_id}.json.""" + return get_metadata_queue_dir() / f"{video_id}.json" + + +def get_raw_metadata_path(video_id: str) -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/metadata/{video_id}.json.""" + return get_raw_metadata_dir() / f"{video_id}.json" + + +def get_transcript_path(video_id: str, lang: str = "en") -> Path: + """Return workspace/raw/cppa_youtube_script_tracker/transcripts/{video_id}.{lang}.vtt.""" + return get_raw_transcripts_dir() / f"{video_id}.{lang}.vtt" + + +def iter_metadata_queue_jsons(): + """Yield Path for each *.json file in the metadata queue directory.""" + queue_dir = get_workspace_root() / "metadata" + if not queue_dir.is_dir(): + return + for path in sorted(queue_dir.glob("*.json")): + if not path.name.startswith("."): + yield path diff --git a/docs/Schema.md b/docs/Schema.md index 3f9fa87..e5f2d81 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -20,6 +20,7 @@ erDiagram BaseProfile ||--o| SlackUser : "extends" BaseProfile ||--o| MailingListProfile : "extends" BaseProfile ||--o| WG21PaperAuthorProfile : "extends" + BaseProfile ||--o| YoutubeSpeaker : "extends" Identity }o--|| BaseProfile : "has" TempProfileIdentityRelation ||--o{ BaseProfile : "has" TmpIdentity ||--o{ TempProfileIdentityRelation : "has" @@ -72,6 +73,12 @@ erDiagram datetime updated_at } + YoutubeSpeaker { + string display_name "IX" + datetime created_at + datetime updated_at + } + Identity { int id PK string display_name "IX" @@ -101,7 +108,7 @@ erDiagram **Note:** The **Email** table references BaseProfile via `base_profile_id` (FK to `BaseProfile.id`). One profile can have multiple email addresses; `is_primary` marks the primary email; `is_active` indicates whether the email is currently active. Other tables (e.g. 
MailingListMessage) can link to a profile via Email. **Note:** The `email` field is **not unique**; the same email address may appear in multiple rows (e.g. for different profiles or over time). -**Note:** The `type` field is a PostgreSQL enum (or equivalent) with values: `github`, `slack`, `mailing_list`, `wg21`. It identifies which extended table the row belongs to. +**Note:** The `type` field is a PostgreSQL enum (or equivalent) with values: `github`, `slack`, `mailing_list`, `wg21`, `discord`, `youtube`. It identifies which extended table the row belongs to. **Note:** In **GitHubAccount**, the `type` field is an enum with values: `user`, `organization`, `enterprise` (identifies whether the GitHub account is a user, organization, or enterprise). @@ -694,6 +701,94 @@ erDiagram --- +### 10. CPPA YouTube Script Tracker + +Stores YouTube video metadata, VTT transcripts, speaker links, and community tags for C++ conference talks (CppCon, C++Now, Meeting C++, etc.). + +- **`YouTubeChannel`** — publisher channel; `channel_id` is the primary key. +- **`YouTubeVideo`** — video metadata and transcript state; `video_id` is the primary key. +- **`YouTubeVideoSpeaker`** — M2M join between `YouTubeVideo` and `cppa_user_tracker.YoutubeSpeaker`. +- **`CppaTags`** — C++ community tag vocabulary (e.g. `concurrency`, `templates`, `modules`). +- **`YouTubeVideoTags`** — M2M join between `YouTubeVideo` and `CppaTags`. 
+ +**Workspace layout:** + +``` +workspace/ +├── cppa_youtube_script_tracker/ +│ └── metadata/{video_id}.json # short-lived queue; moved to raw after DB persist +└── raw/ + └── cppa_youtube_script_tracker/ + ├── metadata/{video_id}.json # permanent archive + └── transcripts/{video_id}.en.vtt # permanent archive +``` + +```mermaid +erDiagram + direction TB + YoutubeSpeaker ||--o{ YouTubeVideoSpeaker : "appears_in" + YouTubeVideo ||--o{ YouTubeVideoSpeaker : "has" + YouTubeChannel ||--o{ YouTubeVideo : "hosts" + YouTubeVideo ||--o{ YouTubeVideoTags : "has" + CppaTags ||--o{ YouTubeVideoTags : "tagged_in" + + YouTubeChannel { + string channel_id PK + string channel_title + datetime created_at + datetime updated_at + } + + YouTubeVideo { + string video_id PK + string channel_id FK + string title + text description + datetime published_at "IX" + int duration_seconds + int view_count + int like_count + int comment_count + string search_term + bool has_transcript + string transcript_path + datetime scraped_at + datetime created_at + datetime updated_at + } + + YouTubeVideoSpeaker { + int id PK + string video_id FK + int speaker_id FK + datetime created_at + } + + CppaTags { + int id PK + string tag_name "UK IX" + } + + YouTubeVideoTags { + int id PK + string youtube_video_id FK + int cppa_tag_id FK + } + + YoutubeSpeaker { + int baseprofile_ptr_id PK + string display_name "IX" + } +``` + +**Note:** `YoutubeSpeaker` is defined in `cppa_user_tracker` (section 1) and extends `BaseProfile`. It is identified solely by `display_name` (same pattern as `MailingListProfile` and `WG21PaperAuthorProfile`). + +**Note:** `YouTubeVideoSpeaker` has a unique constraint on `(video, speaker)`. + +**Note:** `YouTubeVideoTags` has a unique constraint on `(youtube_video, cppa_tag)`. `CppaTags.tag_name` values are stored lowercase. + +--- + ## Appendix ### Appendix A: Table summary @@ -708,7 +803,7 @@ erDiagram | **MailingListProfile** | Profile for mailing list; extends BaseProfile. 
| 1 | | **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | -| **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | +| **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | | **GitHubFile** | File in a repo (filename, repo_id, is_deleted). Base for file subtypes. | 2 | | **Language** | Reference: language name. | 2 | @@ -739,7 +834,7 @@ erDiagram | **BoostLibraryCategoryRelationship** | Library-category link. | 3 | | **BoostExternalRepository** | Extends GitHubRepository; adds boost_version, is_boost_embedded, is_boost_used. | 4 | | **BoostUsage** | External repo use of Boost (repo, boost_header_id, file_path_id, last_commit_date). | 4 | -| **BoostMissingHeaderTmp** | Temporary usage records when header_name is not yet in BoostFile/GitHubFile (usage_id→BoostUsage.id). | 4 | +| **BoostMissingHeaderTmp** | Temporary usage records when header_name is not yet in BoostFile/GitHubFile (usage_id→BoostUsage.id). | 4 | | **MailingListMessage** | Mailing list message (sender_id->MailingListProfile, msg_id, subject, content, list_name, sent_at). | 5 | | **SlackTeam** | Slack workspace (team_id, team_name). | 6 | | **SlackChannel** | Channel in a team (channel_id, name, type, creator_user_id). | 6 | @@ -753,42 +848,53 @@ erDiagram | **WebsiteWordCount** | Per-date, per-word count. | 8 | | **PineconeFailList** | Failed sync records (failed_id, type) for retry/audit. | 9 | | **PineconeSyncStatus** | Last sync per type (type, final_sync_at, created_at, updated_at); type = slack, mailing list, wg21, etc. | 9 | +| **YoutubeSpeaker** | Profile for YouTube speakers; extends BaseProfile. Identified by `display_name`. 
| 1, 10 | +| **YouTubeChannel** | Publisher channel; `channel_id` is PK (no auto-increment id). | 10 | +| **YouTubeVideo** | Video metadata, transcript state, and channel FK; `video_id` is PK (no auto-increment id). | 10 | +| **YouTubeVideoSpeaker** | M2M join between YouTubeVideo and YoutubeSpeaker (video_id, speaker_id). | 10 | +| **CppaTags** | C++ community tag vocabulary (tag_name, unique/lowercase). | 10 | +| **YouTubeVideoTags** | M2M join between YouTubeVideo and CppaTags (youtube_video_id, cppa_tag_id). | 10 | ### Appendix B: Relationship summary -| From | To | Relationship | -| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | -| Identity | BaseProfile | One identity has many profiles | -| BaseProfile | Email | One profile has many emails | -| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile | Extends (1:1 subtype) | -| TmpIdentity | TempProfileIdentityRelation | Has many (target) | -| TempProfileIdentityRelation | BaseProfile | Has many (base_profile_id) | -| GitHubAccount | GitHubRepository | Owns many | -| GitHubRepository | RepoLanguage, RepoLicense | Has many | -| Language | CreatedReposByLanguage | Has many yearly stats | -| GitHubRepository | BoostLibraryRepository, BoostExternalRepository | Extends (1:1 subtype) | -| GitHubRepository | GitCommit, Issue, PullRequest | Contains many | -| GitHubRepository | GitHubFile | Has many | -| GitHubFile | BoostFile | Extends (1:1 subtype) | -| GitHubFile | GitCommitFileChange | Changed in (many file changes) | -| GitCommit | GitCommitFileChange | Has many | -| Issue | IssueComment, IssueAssignee, IssueLabel | Has many | -| PullRequest | PullRequestReview, PullRequestComment, PullRequestAssignee, PullRequestLabel | Has many | -| GitHubAccount | GitCommit, Issue, IssueComment, IssueAssignee, PullRequest, PullRequestReview, 
PullRequestComment, PullRequestAssignee | Committer/creator/author/assignee/reviewer | -| BoostLibraryRepository | BoostLibrary | Has many | -| BoostLibrary | BoostFile, BoostDependency (client/dep), BoostLibraryVersion, DependencyChangeLog | Has many | -| BoostLibrary | BoostLibraryCategoryRelationship | Has many | -| BoostVersion | BoostDependency, BoostLibraryVersion | Version | -| BoostLibraryVersion | BoostLibraryRoleRelationship | Has many | -| GitHubAccount | BoostLibraryRoleRelationship | Role (maintainer/author) | -| BoostLibraryCategory | BoostLibraryCategoryRelationship | Category | -| BoostExternalRepository | BoostUsage | Has many | -| BoostUsage | BoostFile, GitHubFile | References (boost header, file path) | -| BoostUsage | BoostMissingHeaderTmp | Has many (temporary missing-header records) | -| MailingListProfile | MailingListMessage | Sender (has many messages) | -| SlackTeam | SlackChannel | Has many | -| SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many | -| SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user | -| SlackChannel | SlackUser | Creator (many-to-one) | -| WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) | -| WG21Paper | WG21PaperAuthor | Has many authors | +| From | To | Relationship | +| --------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------- | +| Identity | BaseProfile | One identity has many profiles | +| BaseProfile | Email | One profile has many emails | +| BaseProfile | GitHubAccount, SlackUser, MailingListProfile, WG21PaperAuthorProfile, YoutubeSpeaker | Extends (1:1 subtype) | +| TmpIdentity | TempProfileIdentityRelation | Has many (target) | +| TempProfileIdentityRelation | BaseProfile | Has many (base_profile_id) | +| GitHubAccount | GitHubRepository | Owns many | +| 
GitHubRepository | RepoLanguage, RepoLicense | Has many | +| Language | CreatedReposByLanguage | Has many yearly stats | +| GitHubRepository | BoostLibraryRepository, BoostExternalRepository | Extends (1:1 subtype) | +| GitHubRepository | GitCommit, Issue, PullRequest | Contains many | +| GitHubRepository | GitHubFile | Has many | +| GitHubFile | BoostFile | Extends (1:1 subtype) | +| GitHubFile | GitCommitFileChange | Changed in (many file changes) | +| GitCommit | GitCommitFileChange | Has many | +| Issue | IssueComment, IssueAssignee, IssueLabel | Has many | +| PullRequest | PullRequestReview, PullRequestComment, PullRequestAssignee, PullRequestLabel | Has many | +| GitHubAccount | GitCommit, Issue, IssueComment, IssueAssignee, PullRequest, PullRequestReview, PullRequestComment, PullRequestAssignee | Committer/creator/author/assignee/reviewer | +| BoostLibraryRepository | BoostLibrary | Has many | +| BoostLibrary | BoostFile, BoostDependency (client/dep), BoostLibraryVersion, DependencyChangeLog | Has many | +| BoostLibrary | BoostLibraryCategoryRelationship | Has many | +| BoostVersion | BoostDependency, BoostLibraryVersion | Version | +| BoostLibraryVersion | BoostLibraryRoleRelationship | Has many | +| GitHubAccount | BoostLibraryRoleRelationship | Role (maintainer/author) | +| BoostLibraryCategory | BoostLibraryCategoryRelationship | Category | +| BoostExternalRepository | BoostUsage | Has many | +| BoostUsage | BoostFile, GitHubFile | References (boost header, file path) | +| BoostUsage | BoostMissingHeaderTmp | Has many (temporary missing-header records) | +| MailingListProfile | MailingListMessage | Sender (has many messages) | +| SlackTeam | SlackChannel | Has many | +| SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many | +| SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user | +| SlackChannel | SlackUser | Creator (many-to-one) | +| 
WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) | +| WG21Paper | WG21PaperAuthor | Has many authors | +| YoutubeSpeaker | YouTubeVideoSpeaker | Appears in (many videos) | +| YouTubeChannel | YouTubeVideo | Hosts many videos | +| YouTubeVideo | YouTubeVideoSpeaker | Has many speakers | +| YouTubeVideo | YouTubeVideoTags | Has many tags | +| CppaTags | YouTubeVideoTags | Tagged in many videos | diff --git a/docs/service_api/README.md b/docs/service_api/README.md index a36e39d..33300f6 100644 --- a/docs/service_api/README.md +++ b/docs/service_api/README.md @@ -11,6 +11,7 @@ Index of all app service modules. All writes to app models must go through the s | [boost_library_tracker.services](boost_library_tracker.md) | boost_library_tracker | Boost libraries, versions, dependencies, categories, maintainers/authors. | | [boost_usage_tracker.services](boost_usage_tracker.md) | boost_usage_tracker | External repos, Boost usage, missing-header tmp. | | [discord_activity_tracker.services](discord_activity_tracker.md) | discord_activity_tracker | Servers, channels, messages, reactions (user profiles in cppa_user_tracker). | +| [cppa_youtube_script_tracker.services](cppa_youtube_script_tracker.md) | cppa_youtube_script_tracker | YouTube channels, videos, transcript state, and speaker links for C++ conference talks. | --- @@ -21,5 +22,6 @@ Index of all app service modules. All writes to app models must go through the s - **boost_library_tracker** – Get-or-create BoostLibraryRepository, BoostLibrary, BoostVersion, BoostLibraryVersion; add dependencies, categories, and role relationships. - **boost_usage_tracker** – Get-or-create BoostExternalRepository, create/update BoostUsage, record missing headers (BoostMissingHeaderTmp). - **discord_activity_tracker** – Get-or-create DiscordServer, DiscordChannel; create/update DiscordMessage, DiscordReaction. Discord user profiles in cppa_user_tracker. 
+- **cppa_youtube_script_tracker** – Get-or-create YouTubeChannel, YouTubeVideo, CppaTags; update transcript state; link speakers and tags to videos. Speaker profiles (`YoutubeSpeaker`) in cppa_user_tracker. See [Contributing.md](../Contributing.md) for the rule that all writes go through the service layer. diff --git a/docs/service_api/cppa_youtube_script_tracker.md b/docs/service_api/cppa_youtube_script_tracker.md new file mode 100644 index 0000000..3d52dd0 --- /dev/null +++ b/docs/service_api/cppa_youtube_script_tracker.md @@ -0,0 +1,144 @@ +# cppa_youtube_script_tracker — Service API + +**Module path:** `cppa_youtube_script_tracker.services` +**Description:** YouTube channel metadata, video metadata, transcript state, speaker links, and tag links for C++ conference talks. Single place for all writes to `cppa_youtube_script_tracker` models. The tag helpers `get_or_create_tag(tag_name)` (stores names lowercase; raises `ValueError` if empty) and `link_tag_to_video(video, tag)` also live in this module. Speaker profiles live in `cppa_user_tracker.YoutubeSpeaker`. + +**Type notation:** Model types refer to `cppa_youtube_script_tracker.models` unless noted. `YoutubeSpeaker` refers to `cppa_user_tracker.models.YoutubeSpeaker`. + +--- + +## YouTubeChannel + +| Function | Parameter types | Return type | Description | +| ----------------------- | ------------------------------------------------ | ---------------- | ------------------------------------------------------------------------------- | +| `get_or_create_channel` | `channel_id: str`, `channel_title: str = ""` | `YouTubeChannel` | Get or create channel by `channel_id`; updates `channel_title` when a non-empty, different title is passed. | + +--- + +## YouTubeVideo + +| Function | Parameter types | Return type | Description | +| ---------------------- | ---------------------------------------------------------------------------------- | ------------------------ | ----------------------------------------------------------------------------------------------- | +| `get_or_create_video` | `video_id: str`, `channel: YouTubeChannel \| None`, `metadata_dict: dict` | `tuple[YouTubeVideo, bool]` | Get or create video by `video_id`. 
Raises `ValueError` if `video_id` is empty. | +| `update_video_transcript` | `video: YouTubeVideo`, `transcript_path: str` | `YouTubeVideo` | Set `has_transcript=True` and `transcript_path` on the video; saves `update_fields`. | + +`metadata_dict` accepted keys: + +| Key | Type | Notes | +| ------------------ | ----------------- | -------------------------------------------------- | +| `title` | str | | +| `description` | str | | +| `published_at` | datetime or str | ISO string is parsed via `parse_datetime` | +| `duration_seconds` | int | | +| `view_count` | int \| None | | +| `like_count` | int \| None | | +| `comment_count` | int \| None | | +| `tags` | list | Not read by `get_or_create_video`; persist tags with `get_or_create_tag` and `link_tag_to_video` | +| `search_term` | str | Search term used to discover the video | +| `scraped_at` | datetime or str | ISO string is parsed via `parse_datetime` | + +--- + +## YouTubeVideoSpeaker + +| Function | Parameter types | Return type | Description | +| --------------------- | --------------------------------------------- | -------------------- | -------------------------------------------------------- | +| `link_speaker_to_video` | `video: YouTubeVideo`, `speaker: YoutubeSpeaker` | `YouTubeVideoSpeaker` | Get-or-create M2M link between a video and a speaker. | + +--- + +## YoutubeSpeaker (in cppa_user_tracker) + +| Function | Parameter types | Return type | Description | +| ------------------------------- | -------------------------------------------------- | ---------------------------- | -------------------------------------------------------------------------------- | +| `get_or_create_youtube_speaker` | `display_name: str`, `identity: Identity \| None = None` | `tuple[YoutubeSpeaker, bool]` | Get or create a speaker by `display_name`. Raises `ValueError` if name is empty. 
| + +**Module path:** `cppa_user_tracker.services` + +--- + +## Preprocessor + +**Module path:** `cppa_youtube_script_tracker.preprocessor` + +| Function | Parameter types | Return type | Description | +| -------------------------------- | ------------------------------------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------- | +| `preprocess_youtube_for_pinecone` | `failed_ids: list[str]`, `final_sync_at: datetime \| None` | `tuple[list[dict], bool]` | Build Pinecone sync documents for YouTube videos. Returns `(docs, is_chunked=False)`. | + +Each document dict has: +- `content` — Title, speakers, channel, published date, description, and transcript text (if available). +- `metadata["doc_id"]` — `"youtube-{video_id}"`. +- `metadata["ids"]` — DB primary key of the `YouTubeVideo` row (for retry tracking). +- `metadata["type"]` — `"youtube"`. +- `metadata["url"]` — `"https://www.youtube.com/watch?v={video_id}"`. +- `metadata["title"]`, `metadata["author"]` (comma-separated speaker names), `metadata["channel"]`, `metadata["timestamp"]` (Unix timestamp), `metadata["has_transcript"]`. + +--- + +## Workspace helpers + +**Module path:** `cppa_youtube_script_tracker.workspace` + +| Function | Return type | Description | +| ----------------------- | ----------- | --------------------------------------------------------------------------- | +| `get_workspace_root()` | `Path` | `workspace/cppa_youtube_script_tracker/` | +| `get_raw_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/` (permanent archive root; `get_raw_metadata_dir()` returns its `metadata/` subdirectory) | +| `get_raw_transcripts_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/transcripts/` (permanent VTT archive) | +| `get_metadata_queue_dir()` | `Path` | `workspace/cppa_youtube_script_tracker/metadata/` (short-lived; files are moved to raw after DB persist) | +| `get_raw_metadata_path(video_id)` | `Path` | Raw JSON archive path for a video. 
| +| `get_metadata_queue_path(video_id)` | `Path` | Queue JSON path for a video. | +| `get_transcript_path(video_id, lang="en")` | `Path` | VTT path for a video. | +| `iter_metadata_queue_jsons()` | `Iterator[Path]` | Yield the `*.json` files in the metadata queue directory in sorted order, skipping dotfiles. | + +--- + +## Fetcher + +**Module path:** `cppa_youtube_script_tracker.fetcher` + +| Function | Parameter types | Return type | Description | +| --------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------- | +| `fetch_videos` | `published_after: datetime`, `published_before: datetime`, `channel_title: str \| None = None`, `skip_video_ids: set[str] \| None = None`, `min_duration_seconds: int = 0` | `list[dict]` | Fetch video metadata from YouTube Data API v3 for the given time window. Returns normalised metadata dicts. | + +Each returned dict contains the following keys: + +| Key | Type | Notes | +| ------------------ | ----------- | ----------------------------------------------------------------- | +| `video_id` | str | YouTube video ID | +| `title` | str | | +| `description` | str | | +| `channel_id` | str | | +| `channel_title` | str | | +| `published_at` | str | ISO 8601 datetime string from API | +| `duration_seconds` | int | Parsed from ISO 8601 duration (e.g. `PT1H2M10S`) | +| `view_count` | int \| None | | +| `like_count` | int \| None | | +| `comment_count` | int \| None | | +| `tags` | list | | +| `search_term` | str | Query used to discover the video | +| `scraped_at` | str | ISO 8601 datetime when the API call was made | + +**`channel_title` behaviour:** If `channel_title` matches a key in the `C_PLUS_PLUS_CHANNELS` dict, the API call is filtered by that channel's ID. If `channel_title` is unrecognised, a keyword search by name is used. 
If `channel_title` is `None`, all known C++ channels are searched. + +**Requires:** `YOUTUBE_API_KEY` setting. Raises `ValueError` if missing. Raises `ImportError` if `google-api-python-client` is not installed. + +--- + +## Transcript downloader + +**Module path:** `cppa_youtube_script_tracker.transcript` + +| Function | Parameter types | Return type | Description | +| --------------- | ----------------------------------------------------------------------------- | --------------- | ---------------------------------------------------------------------------------------------------------------- | +| `download_vtt` | `video_id: str`, `output_dir: Path`, `cookies_file: str \| None = None` | `Path \| None` | Download English VTT subtitles for `video_id` into `output_dir`. Returns path to the `.vtt` file, or `None` if not found. | + +Tries manual captions first, then auto-generated (`writeautomaticsub`). The output file is written as `{video_id}.en.vtt`; falls back to any `{video_id}*.vtt` file in `output_dir` if the expected name is not present. + +**Requires:** `yt-dlp`. Raises `ImportError` if not installed. + +--- + +## Related docs + +- [Schema.md](../Schema.md) – Section 10: CPPA YouTube Script Tracker. +- [service_api/README.md](README.md) – Service API index. 
diff --git a/requirements.txt b/requirements.txt index 1b3f84d..62ad9d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,7 @@ redis>=5.0 slack-bolt>=1.18 pytz>=2024.1 selenium>=4.35 + +# cppa_youtube_script_tracker app (YouTube Data API v3 + VTT transcript download) +google-api-python-client>=2.100 +yt-dlp==2026.2.4 diff --git a/workflow/management/commands/run_all_collectors.py b/workflow/management/commands/run_all_collectors.py index 25b262b..9d887ea 100644 --- a/workflow/management/commands/run_all_collectors.py +++ b/workflow/management/commands/run_all_collectors.py @@ -20,6 +20,7 @@ "run_boost_usage_tracker", "run_boost_mailing_list_tracker", "run_discord_exporter", + "run_cppa_youtube_script_tracker", ] From c246241b88f298a4aa2b4562b9a5bad05ef9945f Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Mar 2026 10:01:23 -0700 Subject: [PATCH 16/76] Fix: OpenRouter retries, CSV year from parsed date, placeholder race recovery #24 --- .../converters/openai_converter.py | 148 ++++++++++++------ .../commands/import_wg21_metadata_from_csv.py | 13 +- wg21_paper_tracker/services.py | 84 ++++++---- 3 files changed, 155 insertions(+), 90 deletions(-) diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 7edd988..66c08b0 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -10,6 +10,7 @@ import os import shutil import tempfile +import time from pathlib import Path from typing import Optional @@ -148,56 +149,105 @@ def convert_page_with_openai( logger.error("OpenRouter API key is not set") return None - try: - logger.info(f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter") - - url = f"{OPENROUTER_BASE_URL}/chat/completions" - headers = { - "Authorization": f"Bearer {OPENROUTER_API_KEY}", - "Content-Type": "application/json", - } - - payload = { - 
"model": OPENROUTER_MODEL, - "messages": [ - { - "role": "system", - "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. If the page appears rotated, read and convert the text in its correct orientation.", - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}" - }, - }, - ], - }, - ], - "max_tokens": 4000, - } - - response = requests.post(url, json=payload, headers=headers, timeout=120) - response.raise_for_status() - - result = response.json() - markdown_content = result["choices"][0]["message"]["content"] - - logger.info(f"Successfully converted page {page_num} with OpenAI/OpenRouter") - return markdown_content + url = f"{OPENROUTER_BASE_URL}/chat/completions" + headers = { + "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Content-Type": "application/json", + } + + payload = { + "model": OPENROUTER_MODEL, + "messages": [ + { + "role": "system", + "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. 
If the page appears rotated, read and convert the text in its correct orientation.", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_base64}"}, + }, + ], + }, + ], + "max_tokens": 4000, + } + + max_attempts = 3 # initial + 2 retries + retry_delays = [1, 2] # exponential backoff in seconds + + for attempt in range(max_attempts): + try: + logger.info( + f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter" + + (f" (attempt {attempt + 1}/{max_attempts})" if attempt > 0 else "") + ) - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None + response = requests.post(url, json=payload, headers=headers, timeout=120) + response.raise_for_status() + + result = response.json() + markdown_content = result["choices"][0]["message"]["content"] + + logger.info( + f"Successfully converted page {page_num} with OpenAI/OpenRouter" + ) + return markdown_content + + except ( + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + ) as e: + retryable = attempt < max_attempts - 1 + if retryable: + delay = retry_delays[attempt] + logger.warning( + f"Transient error on page {page_num} ({type(e).__name__}), " + f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" + ) + time.sleep(delay) + else: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + except requests.exceptions.HTTPError as e: + status_code = e.response.status_code if e.response is not None else None + retryable = ( + attempt < max_attempts - 1 + and status_code is not None + and (status_code == 429 or 500 <= status_code < 600) + ) + if retryable: + delay = retry_delays[attempt] + logger.warning( + f"HTTP {status_code} on page {page_num}, " + f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" + ) + time.sleep(delay) + else: + logger.error( + f"OpenAI/OpenRouter conversion failed 
for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + return None def convert_with_openai(pdf_path: Path) -> Optional[str]: diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index e00d64b..feff138 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -135,7 +135,6 @@ def handle(self, *args, **options): stats["rows"] += 1 paper_id = (row.get("paper_id", "") or "").strip().lower() url = row.get("url", "") - document_date = row.get("date", "") if not paper_id or not url: stats["skipped"] += 1 @@ -149,14 +148,14 @@ def handle(self, *args, **options): mailing_date, mailing_title = _resolve_mailing_date( row.get("mailing_date", "") ) - year_str = ( - mailing_date[:4] - if mailing_date and MAILING_DATE_PATTERN.match(mailing_date) - else (document_date[:4] if document_date else None) - ) - year = int(year_str) if year_str and year_str.isdigit() else None try: document_date = _parse_document_date(row.get("date", "")) + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): + year = int(mailing_date[:4]) + elif document_date is not None: + year = document_date.year + else: + year = None title = row.get("title", "") or paper_id subgroup = row.get("subgroup", "") author_names = _author_names_from_csv(row.get("author", "")) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index b0be567..24c7ba6 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional -from django.db import transaction +from django.db import IntegrityError, transaction from cppa_user_tracker.services import 
get_or_create_wg21_paper_author_profile from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor @@ -36,7 +36,6 @@ def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, b return mailing, created -@transaction.atomic def get_or_create_paper( paper_id: str, url: str, @@ -77,28 +76,49 @@ def _update_paper(paper: WG21Paper) -> bool: paper.save() return updated - if year_val > 0: - # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year - paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() - if paper: - _update_paper(paper) - created = False - else: - placeholder = WG21Paper.objects.filter(paper_id=paper_id, year=0).first() - if placeholder: - placeholder.url = url - placeholder.title = title - placeholder.document_date = document_date - placeholder.mailing = mailing - placeholder.subgroup = subgroup - placeholder.year = year_val - placeholder.save() - paper = placeholder - created = False + try: + with transaction.atomic(): + if year_val > 0: + # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year + paper = WG21Paper.objects.filter( + paper_id=paper_id, year=year_val + ).first() + if paper: + _update_paper(paper) + created = False + else: + placeholder = WG21Paper.objects.filter( + paper_id=paper_id, year=0 + ).first() + if placeholder: + try: + placeholder.url = url + placeholder.title = title + placeholder.document_date = document_date + placeholder.mailing = mailing + placeholder.subgroup = subgroup + placeholder.year = year_val + placeholder.save() + paper = placeholder + created = False + except IntegrityError: + raise # Roll back this transaction; recovery runs below + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=year_val, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) else: paper, created = 
WG21Paper.objects.get_or_create( paper_id=paper_id, - year=year_val, + year=0, defaults={ "url": url, "title": title, @@ -107,20 +127,16 @@ def _update_paper(paper: WG21Paper) -> bool: "subgroup": subgroup, }, ) - else: - paper, created = WG21Paper.objects.get_or_create( - paper_id=paper_id, - year=0, - defaults={ - "url": url, - "title": title, - "document_date": document_date, - "mailing": mailing, - "subgroup": subgroup, - }, - ) - if not created: + if not created: + _update_paper(paper) + except IntegrityError: + # Placeholder promotion hit (paper_id, year_val) unique constraint; fetch and update canonical row + with transaction.atomic(): + paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() + if not paper: + raise _update_paper(paper) + created = False if author_names: emails = author_emails or [] From 505125802d61d0041b11320a41d5bb587be632b9 Mon Sep 17 00:00:00 2001 From: zho Date: Thu, 12 Mar 2026 01:09:48 +0800 Subject: [PATCH 17/76] #38-fixed lint errors --- .env.example | 1 - config/settings.py | 1 - cppa_youtube_script_tracker/__init__.py | 1 - cppa_youtube_script_tracker/admin.py | 14 ++++++++++++-- .../management/__init__.py | 1 - .../management/commands/__init__.py | 1 - cppa_youtube_script_tracker/preprocessor.py | 15 ++++++++++++--- cppa_youtube_script_tracker/services.py | 9 ++++++++- cppa_youtube_script_tracker/tests/__init__.py | 1 - 9 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.env.example b/.env.example index 008958a..c5a31af 100644 --- a/.env.example +++ b/.env.example @@ -158,4 +158,3 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Earliest published_at to use when DB is empty (ISO 8601, e.g. 
2015-01-01T00:00:00Z) # YOUTUBE_DEFAULT_PUBLISHED_AFTER=2015-01-01T00:00:00Z - diff --git a/config/settings.py b/config/settings.py index 8c9942f..dd3e531 100644 --- a/config/settings.py +++ b/config/settings.py @@ -318,4 +318,3 @@ YOUTUBE_DEFAULT_PUBLISHED_AFTER = ( env("YOUTUBE_DEFAULT_PUBLISHED_AFTER", default="") or "" ).strip() - diff --git a/cppa_youtube_script_tracker/__init__.py b/cppa_youtube_script_tracker/__init__.py index 8b13789..e69de29 100644 --- a/cppa_youtube_script_tracker/__init__.py +++ b/cppa_youtube_script_tracker/__init__.py @@ -1 +0,0 @@ - diff --git a/cppa_youtube_script_tracker/admin.py b/cppa_youtube_script_tracker/admin.py index e96301e..6796678 100644 --- a/cppa_youtube_script_tracker/admin.py +++ b/cppa_youtube_script_tracker/admin.py @@ -1,7 +1,13 @@ from django.contrib import admin from django.contrib.admin import ModelAdmin -from .models import CppaTags, YouTubeChannel, YouTubeVideo, YouTubeVideoSpeaker, YouTubeVideoTags +from .models import ( + CppaTags, + YouTubeChannel, + YouTubeVideo, + YouTubeVideoSpeaker, + YouTubeVideoTags, +) @admin.register(YouTubeChannel) @@ -43,4 +49,8 @@ class CppaTagsAdmin(ModelAdmin): class YouTubeVideoTagsAdmin(ModelAdmin): list_display = ("id", "youtube_video", "cppa_tag") raw_id_fields = ("youtube_video", "cppa_tag") - search_fields = ("youtube_video__video_id", "youtube_video__title", "cppa_tag__tag_name") + search_fields = ( + "youtube_video__video_id", + "youtube_video__title", + "cppa_tag__tag_name", + ) diff --git a/cppa_youtube_script_tracker/management/__init__.py b/cppa_youtube_script_tracker/management/__init__.py index 8b13789..e69de29 100644 --- a/cppa_youtube_script_tracker/management/__init__.py +++ b/cppa_youtube_script_tracker/management/__init__.py @@ -1 +0,0 @@ - diff --git a/cppa_youtube_script_tracker/management/commands/__init__.py b/cppa_youtube_script_tracker/management/commands/__init__.py index 8b13789..e69de29 100644 --- 
a/cppa_youtube_script_tracker/management/commands/__init__.py +++ b/cppa_youtube_script_tracker/management/commands/__init__.py @@ -1 +0,0 @@ - diff --git a/cppa_youtube_script_tracker/preprocessor.py b/cppa_youtube_script_tracker/preprocessor.py index 41983aa..cb3b233 100644 --- a/cppa_youtube_script_tracker/preprocessor.py +++ b/cppa_youtube_script_tracker/preprocessor.py @@ -98,7 +98,9 @@ def _build_document_content(video: YouTubeVideo, speaker_names: list[str]) -> st return "\n".join(parts).strip() -def _build_candidate_queryset(normalized_failed: list[str], final_sync_at: datetime | None): +def _build_candidate_queryset( + normalized_failed: list[str], final_sync_at: datetime | None +): """Return the ORM queryset of candidates to preprocess.""" queryset = YouTubeVideo._default_manager.select_related("channel").prefetch_related( # type: ignore[attr-defined] "video_speakers__speaker" @@ -113,7 +115,9 @@ def _build_candidate_queryset(normalized_failed: list[str], final_sync_at: datet return queryset.filter(criteria).order_by("id") -def _build_video_metadata(video: YouTubeVideo, speaker_names: list[str]) -> dict[str, Any]: +def _build_video_metadata( + video: YouTubeVideo, speaker_names: list[str] +) -> dict[str, Any]: """Build the Pinecone metadata dict for one video.""" channel_title = (video.channel.channel_title if video.channel else "") or "" return { @@ -161,6 +165,11 @@ def preprocess_youtube_for_pinecone( if not content: continue - docs.append({"content": content, "metadata": _build_video_metadata(video, speaker_names)}) + docs.append( + { + "content": content, + "metadata": _build_video_metadata(video, speaker_names), + } + ) return docs, False diff --git a/cppa_youtube_script_tracker/services.py b/cppa_youtube_script_tracker/services.py index 0cacba1..ce4e569 100644 --- a/cppa_youtube_script_tracker/services.py +++ b/cppa_youtube_script_tracker/services.py @@ -12,13 +12,20 @@ from typing import Any, Optional -from .models import CppaTags, YouTubeChannel, 
YouTubeVideo, YouTubeVideoSpeaker, YouTubeVideoTags +from .models import ( + CppaTags, + YouTubeChannel, + YouTubeVideo, + YouTubeVideoSpeaker, + YouTubeVideoTags, +) def _parse_dt_field(value: Any) -> Any: """Parse a datetime string field; returns datetime, None, or the original value.""" if isinstance(value, str) and value: from django.utils.dateparse import parse_datetime as _pd + return _pd(value) return value diff --git a/cppa_youtube_script_tracker/tests/__init__.py b/cppa_youtube_script_tracker/tests/__init__.py index 8b13789..e69de29 100644 --- a/cppa_youtube_script_tracker/tests/__init__.py +++ b/cppa_youtube_script_tracker/tests/__init__.py @@ -1 +0,0 @@ - From 17827f099640814723a1fccd6d109e5e72a09b8c Mon Sep 17 00:00:00 2001 From: zho Date: Thu, 12 Mar 2026 01:29:43 +0800 Subject: [PATCH 18/76] #38-fix a bug --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index c5a31af..4598a53 100644 --- a/.env.example +++ b/.env.example @@ -151,7 +151,7 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # YouTube (cppa_youtube_script_tracker) # ============================================================================= # YouTube Data API v3 key (console.cloud.google.com → APIs & Services → Credentials) -# YOUTUBE_API_KEY=wZLObIrVHpcPZGj60P8GXA... +# YOUTUBE_API_KEY=... 
# Pinecone namespace for YouTube video/transcript sync (default: youtube-scripts) # YOUTUBE_PINECONE_NAMESPACE=youtube-scripts From 5dce586089ca57ef901a1feede84ca403c65c7ed Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 00:30:03 +0800 Subject: [PATCH 19/76] #38-updated youtube_tracker by adding search term, processing QuotaExceededError and most importantly introducing rich name extractor --- .../0007_youtubespeaker_external_id.py | 67 ++++++ cppa_user_tracker/models.py | 7 +- cppa_user_tracker/services.py | 23 +- cppa_youtube_script_tracker/fetcher.py | 153 ++++++++++++-- .../run_cppa_youtube_script_tracker.py | 200 ++++++++++++------ cppa_youtube_script_tracker/preprocessor.py | 6 +- cppa_youtube_script_tracker/services.py | 18 ++ cppa_youtube_script_tracker/transcript.py | 4 +- cppa_youtube_script_tracker/utils.py | 199 +++++++++++++++++ .../cppa_youtube_script_tracker.md | 12 +- 10 files changed, 588 insertions(+), 101 deletions(-) create mode 100644 cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py create mode 100644 cppa_youtube_script_tracker/utils.py diff --git a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py new file mode 100644 index 0000000..61f0304 --- /dev/null +++ b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py @@ -0,0 +1,67 @@ +from django.db import migrations, models + + +def _normalize_name(value: str) -> str: + value = (value or "").strip().lower() + chars = [] + for ch in value: + if ch.isalnum(): + chars.append(ch) + else: + chars.append("_") + slug = "".join(chars).strip("_") + while "__" in slug: + slug = slug.replace("__", "_") + return slug or "unknown" + + +def populate_external_id(apps, schema_editor): + YoutubeSpeaker = apps.get_model("cppa_user_tracker", "YoutubeSpeaker") + + used = set( + YoutubeSpeaker.objects.exclude(external_id__isnull=True) + .exclude(external_id="") + .values_list("external_id", flat=True) + 
) + + for speaker in YoutubeSpeaker.objects.all().order_by("baseprofile_ptr_id"): + if speaker.external_id: + continue + base = _normalize_name(speaker.display_name) + candidate = f"legacy:{base}" + if candidate in used: + candidate = f"{candidate}:{speaker.baseprofile_ptr_id}" + speaker.external_id = candidate + speaker.save(update_fields=["external_id"]) + used.add(candidate) + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0006_alter_slackuser_slack_user_id"), + ] + + operations = [ + # Defensive cleanup for previously failed local runs. + migrations.RunSQL( + sql=( + "DROP INDEX IF EXISTS " + "cppa_user_tracker_youtubespeaker_external_id_8b44bffb_like;" + "DROP INDEX IF EXISTS " + "cppa_user_tracker_youtubespeaker_external_id_8b44bffb;" + ), + reverse_sql=migrations.RunSQL.noop, + ), + migrations.AddField( + model_name="youtubespeaker", + name="external_id", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.RunPython(populate_external_id, migrations.RunPython.noop), + migrations.AlterField( + model_name="youtubespeaker", + name="external_id", + field=models.CharField(max_length=255, unique=True), + ), + ] diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 3e3da6e..17b422c 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -187,12 +187,17 @@ def save(self, *args, **kwargs): class YoutubeSpeaker(BaseProfile): - """YouTube speaker profile. Identified by display_name.""" + """YouTube speaker profile. + + Uses external_id as canonical identifier (stable across updates). display_name is + a human-readable field and is not used as the identity key. 
+ """ def save(self, *args, **kwargs): self.type = ProfileType.YOUTUBE super().save(*args, **kwargs) + external_id = models.CharField(max_length=255, unique=True) display_name = models.CharField(max_length=255, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 0e420e1..d73bcf1 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -354,19 +354,26 @@ def get_or_create_discord_profile( def get_or_create_youtube_speaker( - display_name: str, + external_id: str, + display_name: str = "", identity: Optional[Identity] = None, ) -> tuple[YoutubeSpeaker, bool]: - """Get or create a YoutubeSpeaker by display_name. Returns (speaker, created). + """Get or create a YoutubeSpeaker by external_id. Returns (speaker, created). - Looks up by display_name. On creation, sets identity if provided. - Raises ValueError if display_name is empty. + Looks up by external_id. On creation, sets identity/display_name if provided. + If the record already exists and a non-empty display_name is provided, updates + display_name when changed. + Raises ValueError if external_id is empty. 
""" + external_id_val = (external_id or "").strip() display_name_val = (display_name or "").strip() - if not display_name_val: - raise ValueError("display_name must not be empty.") + if not external_id_val: + raise ValueError("external_id must not be empty.") speaker, created = YoutubeSpeaker.objects.get_or_create( - display_name=display_name_val, - defaults={"identity": identity}, + external_id=external_id_val, + defaults={"display_name": display_name_val, "identity": identity}, ) + if not created and display_name_val and speaker.display_name != display_name_val: + speaker.display_name = display_name_val + speaker.save(update_fields=["display_name", "updated_at"]) return speaker, created diff --git a/cppa_youtube_script_tracker/fetcher.py b/cppa_youtube_script_tracker/fetcher.py index 60e43c6..a55c0df 100644 --- a/cppa_youtube_script_tracker/fetcher.py +++ b/cppa_youtube_script_tracker/fetcher.py @@ -27,8 +27,46 @@ "Bo Qian": "UCEqgmyWChwmqyRdmnsS24Zw", } +_CHANNEL_FOCUSED_TERMS: list[str] = [ + "C++", +] + +# Search-term based discovery (global searches, not tied to one channel ID) +_GLOBAL_SEARCH_TERMS: list[str] = [ + "C++ programming", + "C++ tutorial", + "C++ advanced", + "modern C++", + "C++20", + "C++23", + "C++ templates", + "C++ STL", + "C++ best practices", + "C++ performance", + "Boost C++", +] + +# Famous-figure focused discovery terms. 
+_FAMOUS_FIGURE_TERMS: list[str] = [ + "Bjarne Stroustrup C++", + "Herb Sutter C++", + "Scott Meyers C++", + "Andrei Alexandrescu C++", + "Nicolai Josuttis C++", + "Chandler Carruth C++", + "Kate Gregory C++", + "Jason Turner C++", + "Sean Parent C++", + "Jonathan Boccara C++", +] + _MAX_RESULTS_PER_PAGE = 50 _DELAY_SECONDS = 0.5 +_DEFAULT_MAX_QUERY_PAIRS = 30 + + +class QuotaExceededError(RuntimeError): + """Raised when YouTube Data API quota has been exhausted.""" def _get_api_key() -> str: @@ -55,6 +93,25 @@ def _parse_duration_iso(duration_iso: str) -> int: ) +def _is_quota_exceeded_error(exc: Exception) -> bool: + text = str(exc).lower() + return "quotaexceeded" in text or "youtube.quota" in text + + +def _get_max_query_pairs() -> int: + """ + Return max number of query pairs for one run. + + Configure with `YOUTUBE_MAX_QUERY_PAIRS` in Django settings/.env. + """ + raw = getattr(settings, "YOUTUBE_MAX_QUERY_PAIRS", _DEFAULT_MAX_QUERY_PAIRS) + try: + value = int(raw) + except (TypeError, ValueError): + value = _DEFAULT_MAX_QUERY_PAIRS + return max(1, value) + + def _format_video_data( video_data: dict[str, Any], search_term: str = "" ) -> dict[str, Any]: @@ -91,7 +148,31 @@ def _to_rfc3339(dt: datetime) -> str: def _build_queries(channel_title: Optional[str]) -> list[tuple[str, Optional[str]]]: - """Return list of (query_text, channel_id_or_None) pairs to iterate over.""" + """Return list of (query_text, channel_id_or_None) pairs to iterate over. + + Strategy: + - If channel_title is specified: + - Known channel ID: run several C++ terms scoped to that channel. + - Unknown channel: run keyword searches with that channel title. + - Otherwise: + - Run channel-scoped queries for known channels. + - Run global term-based discovery queries. + - Run famous-figure discovery queries. 
+ """ + + def _dedupe_pairs( + pairs: list[tuple[str, Optional[str]]], + ) -> list[tuple[str, Optional[str]]]: + seen: set[tuple[str, Optional[str]]] = set() + out: list[tuple[str, Optional[str]]] = [] + for query_text, ch_id in pairs: + key = (query_text.strip().casefold(), ch_id) + if key in seen: + continue + seen.add(key) + out.append((query_text, ch_id)) + return out + if channel_title: ch_id = C_PLUS_PLUS_CHANNELS.get(channel_title) if not ch_id: @@ -100,9 +181,18 @@ def _build_queries(channel_title: Optional[str]) -> list[tuple[str, Optional[str "falling back to keyword search", channel_title, ) - return [(channel_title, None)] - return [("C++", ch_id)] - return [("C++", ch_id) for ch_id in C_PLUS_PLUS_CHANNELS.values()] + return _dedupe_pairs( + [(channel_title, None), (f"{channel_title} C++", None)] + ) + return _dedupe_pairs([(term, ch_id) for term in _CHANNEL_FOCUSED_TERMS]) + + pairs: list[tuple[str, Optional[str]]] = [] + for ch_id in C_PLUS_PLUS_CHANNELS.values(): + pairs.extend((term, ch_id) for term in _CHANNEL_FOCUSED_TERMS) + + pairs.extend((term, None) for term in _FAMOUS_FIGURE_TERMS) + pairs.extend((term, None) for term in _GLOBAL_SEARCH_TERMS) + return _dedupe_pairs(pairs) def _fetch_search_page( @@ -113,7 +203,10 @@ def _fetch_search_page( before_str: str, page_token: Optional[str], ) -> Optional[dict[str, Any]]: - """Execute one search().list() call; return the response or None on error.""" + """Execute one search().list() call; return the response or None on error. + + Raises QuotaExceededError when API quota is exhausted. 
+ """ params: dict[str, Any] = { "q": query_text, "part": "id,snippet", @@ -131,12 +224,17 @@ def _fetch_search_page( time.sleep(_DELAY_SECONDS) return youtube.search().list(**params).execute() # type: ignore[union-attr] except Exception as exc: # pylint: disable=broad-exception-caught + if _is_quota_exceeded_error(exc): + raise QuotaExceededError("YouTube API quota exceeded.") from exc logger.error("fetch_videos: search API error: %s", exc) return None def _fetch_video_details(youtube: Any, video_ids: list[str]) -> list[dict[str, Any]]: - """Execute one videos().list() call; return items or empty list on error.""" + """Execute one videos().list() call; return items or empty list on error. + + Raises QuotaExceededError when API quota is exhausted. + """ try: time.sleep(_DELAY_SECONDS) resp = ( @@ -146,6 +244,8 @@ def _fetch_video_details(youtube: Any, video_ids: list[str]) -> list[dict[str, A ) return resp.get("items", []) except Exception as exc: # pylint: disable=broad-exception-caught + if _is_quota_exceeded_error(exc): + raise QuotaExceededError("YouTube API quota exceeded.") from exc logger.error("fetch_videos: videos.list API error: %s", exc) return [] @@ -226,19 +326,38 @@ def fetch_videos( before_str = _to_rfc3339(published_before) seen_ids: set[str] = set(skip_video_ids or set()) all_videos: list[dict[str, Any]] = [] - - for query_text, ch_id in _build_queries(channel_title): - all_videos.extend( - _process_one_channel_query( - youtube, + query_pairs = _build_queries(channel_title) + max_queries = _get_max_query_pairs() + if len(query_pairs) > max_queries: + logger.warning( + "fetch_videos: query list truncated from %d to %d by YOUTUBE_MAX_QUERY_PAIRS", + len(query_pairs), + max_queries, + ) + query_pairs = query_pairs[:max_queries] + + for idx, (query_text, ch_id) in enumerate(query_pairs, start=1): + try: + all_videos.extend( + _process_one_channel_query( + youtube, + query_text, + ch_id, + after_str, + before_str, + seen_ids, + min_duration_seconds, + ) 
+ ) + except QuotaExceededError: + logger.error( + "fetch_videos: quota exhausted at query %d/%d (%r). " + "Returning partial results collected so far.", + idx, + len(query_pairs), query_text, - ch_id, - after_str, - before_str, - seen_ids, - min_duration_seconds, ) - ) + break logger.info("fetch_videos: fetched %d videos", len(all_videos)) return all_videos diff --git a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py index 1ae922e..37afb50 100644 --- a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py +++ b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py @@ -36,9 +36,16 @@ get_or_create_video, link_speaker_to_video, link_tag_to_video, + remove_speaker_links_by_name, update_video_transcript, ) from cppa_youtube_script_tracker.transcript import download_vtt +from cppa_youtube_script_tracker.utils import ( + UNKNOWN_SPEAKER_NAME, + build_speaker_external_id, + clean_text, + resolve_speakers, +) from cppa_youtube_script_tracker.workspace import ( get_metadata_queue_path, get_raw_metadata_path, @@ -54,30 +61,6 @@ YOUTUBE_COOKIES_FILE = os.getenv("YOUTUBE_COOKIES_FILE", "youtube_cookies.txt") -def _clean_text(value: object) -> str: - """Return DB-safe text (PostgreSQL rejects NUL bytes).""" - if value is None: - return "" - value = str(value).replace("\x00", "").replace("\u2019", "'") - - return value - - -def _extract_speakers_from_title(title: str) -> list[str]: - """Heuristic: extract speaker names from talk titles like 'Topic - Speaker Name'. - - Returns a list of candidate names (may be empty if no pattern matched). 
- """ - if not title: - return [] - for sep in (" - ", " — ", " | "): - if sep in title: - candidate = title.split(sep)[-1].strip() - if candidate and len(candidate) < 80 and " " in candidate: - return [candidate] - return [] - - def _move_to_raw(video_id: str, queue_path) -> None: """Move a metadata JSON from queue to raw/metadata/ (permanent archive).""" try: @@ -97,25 +80,15 @@ def _move_to_raw(video_id: str, queue_path) -> None: def _persist_video(video_data: dict) -> tuple[bool, bool]: """Persist one video metadata dict to DB. Returns (created, skipped).""" - video_id = _clean_text(video_data.get("video_id", "")).strip() + video_id = clean_text(video_data.get("video_id", "")) if not video_id: return False, True - channel_id = _clean_text(video_data.get("channel_id", "")).strip() - channel_title = _clean_text(video_data.get("channel_title", "")).strip() + channel_id = clean_text(video_data.get("channel_id", "")) + channel_title = clean_text(video_data.get("channel_title", "")) channel = get_or_create_channel(channel_id, channel_title) if channel_id else None - metadata = { - "title": _clean_text(video_data.get("title", "")), - "description": _clean_text(video_data.get("description", "")), - "published_at": video_data.get("published_at"), - "duration_seconds": video_data.get("duration_seconds", 0), - "view_count": video_data.get("view_count"), - "like_count": video_data.get("like_count"), - "comment_count": video_data.get("comment_count"), - "search_term": _clean_text(video_data.get("search_term", "")), - "scraped_at": video_data.get("scraped_at"), - } + metadata = _build_video_metadata(video_data) try: video, created = get_or_create_video( @@ -126,36 +99,77 @@ def _persist_video(video_data: dict) -> tuple[bool, bool]: return False, True if created: - for name in _extract_speakers_from_title( - _clean_text(video_data.get("title", "")) - ): - try: - speaker, _ = get_or_create_youtube_speaker(display_name=name) - link_speaker_to_video(video, speaker) - except 
Exception: - logger.warning( - "_persist_video: could not link speaker %r to video %s", - name, - video_id, - ) - - for raw_tag in video_data.get("tags") or []: - tag_name = _clean_text(raw_tag).strip() - if not tag_name: - continue - try: - tag = get_or_create_tag(tag_name) - link_tag_to_video(video, tag) - except Exception: - logger.warning( - "_persist_video: could not link tag %r to video %s", - tag_name, - video_id, - ) + speaker_names = _resolve_video_speakers(video_data, channel_title) + _link_speakers(video, speaker_names, channel_id=channel_id, video_id=video_id) + _link_tags(video, video_data.get("tags") or [], video_id=video_id) return created, False +def _build_video_metadata(video_data: dict) -> dict: + return { + "title": clean_text(video_data.get("title", "")), + "description": clean_text(video_data.get("description", "")), + "published_at": video_data.get("published_at"), + "duration_seconds": video_data.get("duration_seconds", 0), + "view_count": video_data.get("view_count"), + "like_count": video_data.get("like_count"), + "comment_count": video_data.get("comment_count"), + "search_term": clean_text(video_data.get("search_term", "")), + "scraped_at": video_data.get("scraped_at"), + } + + +def _link_speakers( + video: YouTubeVideo, + speaker_names: list[str], + *, + channel_id: str, + video_id: str, +) -> None: + for name in speaker_names: + try: + speaker, _ = get_or_create_youtube_speaker( + external_id=build_speaker_external_id( + speaker_name=name, + channel_id=channel_id, + video_id=video_id, + ), + display_name=name, + ) + link_speaker_to_video(video, speaker) + except Exception: + logger.warning( + "_link_speakers: could not link speaker %r to video %s", + name, + video_id, + ) + + +def _link_tags(video: YouTubeVideo, raw_tags: list[str], *, video_id: str) -> None: + for raw_tag in raw_tags: + tag_name = clean_text(raw_tag) + if not tag_name: + continue + try: + tag = get_or_create_tag(tag_name) + link_tag_to_video(video, tag) + except 
Exception: + logger.warning( + "_link_tags: could not link tag %r to video %s", + tag_name, + video_id, + ) + + +def _resolve_video_speakers(video_data: dict, channel_title: str) -> list[str]: + return resolve_speakers( + title=clean_text(video_data.get("title", "")), + description=clean_text(video_data.get("description", "")), + channel_title=channel_title, + ) + + def _process_queue() -> tuple[int, int]: """Phase 1: load each metadata queue JSON, persist to DB, move to raw/metadata/. @@ -262,6 +276,63 @@ def _persist_fetched_video(vdata: dict) -> tuple[bool, bool]: return False, True +def _read_text_file(path: str) -> str: + try: + with open(path, "r", encoding="utf-8") as file_obj: + return file_obj.read() + except Exception: + return "" + + +def _enrich_speakers_from_transcript( + video_obj: YouTubeVideo, transcript_path: str +) -> None: + """Try transcript-based speaker extraction and replace unknown fallback if possible.""" + transcript_text = _read_text_file(transcript_path) + if not transcript_text: + return + + resolved = resolve_speakers( + title=clean_text(video_obj.title), + description=clean_text(video_obj.description), + channel_title=( + clean_text(video_obj.channel.channel_title) if video_obj.channel else "" + ), + transcript_text=transcript_text, + ) + if not resolved: + return + + # If we discovered a concrete speaker name, remove fallback "unkown" links first. 
+ has_known = any( + name.casefold() != UNKNOWN_SPEAKER_NAME.casefold() for name in resolved + ) + if has_known: + remove_speaker_links_by_name(video_obj, UNKNOWN_SPEAKER_NAME) + + for name in resolved: + try: + speaker, _ = get_or_create_youtube_speaker( + external_id=build_speaker_external_id( + speaker_name=name, + channel_id=( + clean_text(video_obj.channel.channel_id) + if video_obj.channel + else "" + ), + video_id=video_obj.video_id, + ), + display_name=name, + ) + link_speaker_to_video(video_obj, speaker) + except Exception: + logger.warning( + "_enrich_speakers_from_transcript: could not link speaker %r to video %s", + name, + video_obj.video_id, + ) + + def _run_phase_2( start_time: datetime, end_time: datetime, @@ -310,6 +381,7 @@ def _run_phase_3() -> tuple[int, int]: if vtt_path: video_obj = YouTubeVideo.objects.get(video_id=vid) update_video_transcript(video_obj, str(vtt_path)) + _enrich_speakers_from_transcript(video_obj, str(vtt_path)) ok += 1 else: fail += 1 diff --git a/cppa_youtube_script_tracker/preprocessor.py b/cppa_youtube_script_tracker/preprocessor.py index cb3b233..1cc5c5b 100644 --- a/cppa_youtube_script_tracker/preprocessor.py +++ b/cppa_youtube_script_tracker/preprocessor.py @@ -102,17 +102,17 @@ def _build_candidate_queryset( normalized_failed: list[str], final_sync_at: datetime | None ): """Return the ORM queryset of candidates to preprocess.""" - queryset = YouTubeVideo._default_manager.select_related("channel").prefetch_related( # type: ignore[attr-defined] + queryset = YouTubeVideo.objects.select_related("channel").prefetch_related( "video_speakers__speaker" ) if final_sync_at is None and not normalized_failed: - return queryset.order_by("id") + return queryset.order_by("video_id") criteria = Q() if final_sync_at is not None: criteria |= Q(created_at__gt=final_sync_at) if normalized_failed: criteria |= Q(video_id__in=normalized_failed) - return queryset.filter(criteria).order_by("id") + return 
queryset.filter(criteria).order_by("video_id") def _build_video_metadata( diff --git a/cppa_youtube_script_tracker/services.py b/cppa_youtube_script_tracker/services.py index ce4e569..51fcb6e 100644 --- a/cppa_youtube_script_tracker/services.py +++ b/cppa_youtube_script_tracker/services.py @@ -113,6 +113,24 @@ def link_speaker_to_video( return join +def remove_speaker_links_by_name( + video: YouTubeVideo, + speaker_name: str, +) -> int: + """Remove all speaker links for a video where speaker.display_name matches speaker_name. + + Returns number of deleted join rows. + """ + speaker_name_val = (speaker_name or "").strip() + if not speaker_name_val: + return 0 + deleted, _ = YouTubeVideoSpeaker.objects.filter( + video=video, + speaker__display_name=speaker_name_val, + ).delete() + return int(deleted) + + def get_or_create_tag(tag_name: str) -> CppaTags: """Get or create a CppaTags entry by tag_name. diff --git a/cppa_youtube_script_tracker/transcript.py b/cppa_youtube_script_tracker/transcript.py index f1230db..585fdf4 100644 --- a/cppa_youtube_script_tracker/transcript.py +++ b/cppa_youtube_script_tracker/transcript.py @@ -21,8 +21,8 @@ "writeautomaticsub": True, "subtitleslangs": ["en"], "subtitlesformat": "vtt", - "quiet": True, - "no_warnings": True, + "quiet": False, + "no_warnings": False, "ignore_no_formats_error": True, "extractor_args": { "youtube": ["player_client=tv,web_safari"], diff --git a/cppa_youtube_script_tracker/utils.py b/cppa_youtube_script_tracker/utils.py new file mode 100644 index 0000000..71e242e --- /dev/null +++ b/cppa_youtube_script_tracker/utils.py @@ -0,0 +1,199 @@ +""" +Speaker extraction utilities for cppa_youtube_script_tracker. 
+ +Priority order: +1) description patterns +2) title pattern +3) transcript introduction patterns +4) fallback to "unkown" +""" + +from __future__ import annotations + +import re +from typing import Iterable + +UNKNOWN_SPEAKER_NAME = "unkown" + +_SEPARATORS = (" - ", " — ", " | ") +_INTRO_RE = re.compile( + r"(?i)\b(?:i am|my name is)\s+([A-Z][A-Za-z'`-]*(?:\s+[A-Z][A-Za-z'`-]*){0,4})" +) + + +def clean_text(value: object) -> str: + if value is None: + return "" + return str(value).replace("\x00", "").replace("\u2019", "'").strip() + + +def _slugify_speaker_name(name: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "_", clean_text(name).lower()).strip("_") + return slug or "unknown" + + +def build_speaker_external_id( + speaker_name: str, + channel_id: str = "", + video_id: str = "", +) -> str: + """Build a stable speaker external identifier from channel/video context.""" + slug = _slugify_speaker_name(speaker_name) + channel_id = clean_text(channel_id) + video_id = clean_text(video_id) + if channel_id: + return f"youtube:channel:{channel_id}:speaker:{slug}" + if video_id: + return f"youtube:video:{video_id}:speaker:{slug}" + return f"youtube:name:{slug}" + + +def _normalize_name(name: str) -> str: + name = re.sub(r"\s+", " ", clean_text(name)) + name = name.strip(" .,:;\"'`-") + return name + + +def _dedupe_keep_order(values: Iterable[str]) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for value in values: + key = value.casefold() + if value and key not in seen: + seen.add(key) + out.append(value) + return out + + +def _extract_speaker_colon_line(description: str) -> list[str]: + # Example: "Speaker: Ehsan Amiri" + matches = re.findall(r"(?im)^\s*speaker\s*:\s*(.+?)\s*$", description or "") + return [_normalize_name(m) for m in matches if _normalize_name(m)] + + +def _extract_middle_name_from_triplet( + text: str, title: str = "", channel_title: str = "" +) -> str: + """ + Try parsing structures like: + {title} - {speaker} - {channel} + """ + 
text_norm = clean_text(text) + if not text_norm: + return "" + + for sep in _SEPARATORS: + if sep not in text_norm: + continue + parts = [_normalize_name(p) for p in text_norm.split(sep)] + parts = [p for p in parts if p] + if len(parts) < 3: + continue + candidate = parts[-2] + last = parts[-1].casefold() + first = parts[0].casefold() + title_cf = clean_text(title).casefold() + channel_cf = clean_text(channel_title).casefold() + + # Prefer high-confidence: title/speaker/channel match pattern. + if channel_cf and channel_cf in last: + return candidate + if title_cf and title_cf in first: + return candidate + + return "" + + +def _extract_from_intro_pattern(text: str) -> list[str]: + matches = _INTRO_RE.findall(text or "") + return [_normalize_name(m) for m in matches if _normalize_name(m)] + + +def extract_speakers_from_description( + description: str, title: str = "", channel_title: str = "" +) -> list[str]: + """ + Description-based speaker extraction: + - line starting with "Speaker:" + - 4th non-empty line pattern: {title} - {speaker} - {channel} + - intro pattern: "I am ..." / "my name is ..." 
+ """ + description = clean_text(description) + if not description: + return [] + + speakers: list[str] = [] + speakers.extend(_extract_speaker_colon_line(description)) + + non_empty_lines = [ln.strip() for ln in description.splitlines() if ln.strip()] + if len(non_empty_lines) >= 4: + candidate = _extract_middle_name_from_triplet( + non_empty_lines[3], title=title, channel_title=channel_title + ) + if candidate: + speakers.append(candidate) + + for line in non_empty_lines: + candidate = _extract_middle_name_from_triplet( + line, title=title, channel_title=channel_title + ) + if candidate: + speakers.append(candidate) + + speakers.extend(_extract_from_intro_pattern(description)) + return _dedupe_keep_order(speakers) + + +def extract_speakers_from_title(title: str, channel_title: str = "") -> list[str]: + """ + Title-based extraction for structures: + {title} - {speaker} - {channel} + """ + title = clean_text(title) + if not title: + return [] + + candidate = _extract_middle_name_from_triplet(title, title=title, channel_title=channel_title) + if candidate: + return [candidate] + return [] + + +def extract_speakers_from_transcript_text(transcript_text: str) -> list[str]: + """ + Transcript fallback extraction using introduction patterns. + We prioritize early transcript content where introductions usually appear. 
+ """ + transcript_text = clean_text(transcript_text) + if not transcript_text: + return [] + early_text = transcript_text[:8000] + return _dedupe_keep_order(_extract_from_intro_pattern(early_text)) + + +def resolve_speakers( + *, + title: str, + description: str, + channel_title: str = "", + transcript_text: str = "", +) -> list[str]: + """ + Resolve speakers using priority: + description -> title -> transcript -> ["unkown"] + """ + from_description = extract_speakers_from_description( + description=description, title=title, channel_title=channel_title + ) + if from_description: + return from_description + + from_title = extract_speakers_from_title(title=title, channel_title=channel_title) + if from_title: + return from_title + + from_transcript = extract_speakers_from_transcript_text(transcript_text) + if from_transcript: + return from_transcript + + return [UNKNOWN_SPEAKER_NAME] diff --git a/docs/service_api/cppa_youtube_script_tracker.md b/docs/service_api/cppa_youtube_script_tracker.md index 3d52dd0..337af77 100644 --- a/docs/service_api/cppa_youtube_script_tracker.md +++ b/docs/service_api/cppa_youtube_script_tracker.md @@ -51,7 +51,7 @@ | Function | Parameter types | Return type | Description | | ------------------------------- | -------------------------------------------------- | ---------------------------- | -------------------------------------------------------------------------------- | -| `get_or_create_youtube_speaker` | `display_name: str`, `identity: Identity \| None = None` | `tuple[YoutubeSpeaker, bool]` | Get or create a speaker by `display_name`. Raises `ValueError` if name is empty. | +| `get_or_create_youtube_speaker` | `external_id: str`, `display_name: str = ""`, `identity: Identity \| None = None` | `tuple[YoutubeSpeaker, bool]` | Get or create a speaker by `external_id`; updates `display_name` when provided. Raises `ValueError` if `external_id` is empty. 
| **Module path:** `cppa_user_tracker.services` @@ -83,12 +83,12 @@ Each document dict has: | ----------------------- | ----------- | --------------------------------------------------------------------------- | | `get_workspace_root()` | `Path` | `workspace/cppa_youtube_script_tracker/` | | `get_raw_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/` (permanent JSON archive) | -| `get_transcripts_dir()` | `Path` | `workspace/cppa_youtube_script_tracker/transcripts/` (permanent VTT archive) | -| `get_queue_dir()` | `Path` | `workspace/cppa_youtube_script_tracker/queue/` (short-lived; deleted after persist) | -| `get_raw_json_path(video_id)` | `Path` | Raw JSON archive path for a video. | -| `get_queue_json_path(video_id)` | `Path` | Queue JSON path for a video. | +| `get_raw_transcripts_dir()` | `Path` | `workspace/raw/cppa_youtube_script_tracker/transcripts/` (permanent VTT archive) | +| `get_metadata_queue_dir()` | `Path` | `workspace/cppa_youtube_script_tracker/metadata/` (short-lived; moved after persist) | +| `get_raw_metadata_path(video_id)` | `Path` | Raw metadata JSON archive path for a video. | +| `get_metadata_queue_path(video_id)` | `Path` | Metadata queue JSON path for a video. | | `get_transcript_path(video_id, lang="en")` | `Path` | VTT path for a video. | -| `iter_queue_jsons()` | `Iterator[Path]` | Yield all `*.json` files in the queue directory. | +| `iter_metadata_queue_jsons()` | `Iterator[Path]` | Yield all `*.json` files in the metadata queue directory. 
| --- From 6abde94faebf0c88b680220b873ec161ab2832e3 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 00:31:38 +0800 Subject: [PATCH 20/76] #38-fixed lint error --- cppa_youtube_script_tracker/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cppa_youtube_script_tracker/utils.py b/cppa_youtube_script_tracker/utils.py index 71e242e..ee669cf 100644 --- a/cppa_youtube_script_tracker/utils.py +++ b/cppa_youtube_script_tracker/utils.py @@ -153,7 +153,9 @@ def extract_speakers_from_title(title: str, channel_title: str = "") -> list[str if not title: return [] - candidate = _extract_middle_name_from_triplet(title, title=title, channel_title=channel_title) + candidate = _extract_middle_name_from_triplet( + title, title=title, channel_title=channel_title + ) if candidate: return [candidate] return [] From 516a0f8e48f332a90b73f09397039b86a65f6549 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 02:19:18 +0800 Subject: [PATCH 21/76] #38-addressed the review results of coderabbitai --- .../0007_youtubespeaker_external_id.py | 24 ++++---- cppa_user_tracker/models.py | 2 +- .../run_cppa_youtube_script_tracker.py | 57 +++++++------------ cppa_youtube_script_tracker/preprocessor.py | 2 +- cppa_youtube_script_tracker/services.py | 2 + .../cppa_youtube_script_tracker.md | 3 +- 6 files changed, 37 insertions(+), 53 deletions(-) diff --git a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py index 61f0304..4cd1607 100644 --- a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py +++ b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py @@ -1,21 +1,17 @@ +import re + from django.db import migrations, models -def _normalize_name(value: str) -> str: - value = (value or "").strip().lower() - chars = [] - for ch in value: - if ch.isalnum(): - chars.append(ch) - else: - chars.append("_") - slug = "".join(chars).strip("_") - while "__" in slug: - 
slug = slug.replace("__", "_") - return slug or "unknown" +def _slugify_speaker_name(name: str) -> str: + """Match cppa_youtube_script_tracker.utils._slugify_speaker_name (no channel/video).""" + s = (name or "").strip().lower() + s = re.sub(r"[^a-z0-9]+", "_", s).strip("_") + return s or "unknown" def populate_external_id(apps, schema_editor): + """Seed external_id using same format as build_speaker_external_id(..., "", "").""" YoutubeSpeaker = apps.get_model("cppa_user_tracker", "YoutubeSpeaker") used = set( @@ -27,8 +23,8 @@ def populate_external_id(apps, schema_editor): for speaker in YoutubeSpeaker.objects.all().order_by("baseprofile_ptr_id"): if speaker.external_id: continue - base = _normalize_name(speaker.display_name) - candidate = f"legacy:{base}" + slug = _slugify_speaker_name(speaker.display_name) + candidate = f"youtube:name:{slug}" if candidate in used: candidate = f"{candidate}:{speaker.baseprofile_ptr_id}" speaker.external_id = candidate diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 17b422c..4e7afd3 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -198,6 +198,6 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) external_id = models.CharField(max_length=255, unique=True) - display_name = models.CharField(max_length=255, db_index=True) + display_name = models.CharField(max_length=255, db_index=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py index 37afb50..4e96089 100644 --- a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py +++ b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py @@ -22,6 +22,7 @@ from typing import Optional from django.conf import settings +from 
django.core.exceptions import ValidationError from django.core.management import call_command from django.core.management.base import BaseCommand from django.utils.dateparse import parse_datetime @@ -69,13 +70,10 @@ def _move_to_raw(video_id: str, queue_path) -> None: shutil.move(str(queue_path), str(raw_path)) except Exception: logger.warning( - "_move_to_raw: could not move %s to raw/metadata/, removing instead", + "_move_to_raw: could not move %s to raw/metadata/, leaving in queue", queue_path, ) - try: - queue_path.unlink(missing_ok=True) - except Exception: - pass + return def _persist_video(video_data: dict) -> tuple[bool, bool]: @@ -94,14 +92,15 @@ def _persist_video(video_data: dict) -> tuple[bool, bool]: video, created = get_or_create_video( video_id=video_id, channel=channel, metadata_dict=metadata ) - except Exception: - logger.exception("_persist_video: failed to persist video_id=%s", video_id) + except (ValueError, ValidationError) as e: + logger.warning( + "_persist_video: validation error for video_id=%s: %s", video_id, e + ) return False, True - if created: - speaker_names = _resolve_video_speakers(video_data, channel_title) - _link_speakers(video, speaker_names, channel_id=channel_id, video_id=video_id) - _link_tags(video, video_data.get("tags") or [], video_id=video_id) + speaker_names = _resolve_video_speakers(video_data, channel_title) + _link_speakers(video, speaker_names, channel_id=channel_id, video_id=video_id) + _link_tags(video, video_data.get("tags") or [], video_id=video_id) return created, False @@ -128,22 +127,15 @@ def _link_speakers( video_id: str, ) -> None: for name in speaker_names: - try: - speaker, _ = get_or_create_youtube_speaker( - external_id=build_speaker_external_id( - speaker_name=name, - channel_id=channel_id, - video_id=video_id, - ), - display_name=name, - ) - link_speaker_to_video(video, speaker) - except Exception: - logger.warning( - "_link_speakers: could not link speaker %r to video %s", - name, - video_id, - ) + 
speaker, _ = get_or_create_youtube_speaker( + external_id=build_speaker_external_id( + speaker_name=name, + channel_id=channel_id, + video_id=video_id, + ), + display_name=name, + ) + link_speaker_to_video(video, speaker) def _link_tags(video: YouTubeVideo, raw_tags: list[str], *, video_id: str) -> None: @@ -151,15 +143,8 @@ def _link_tags(video: YouTubeVideo, raw_tags: list[str], *, video_id: str) -> No tag_name = clean_text(raw_tag) if not tag_name: continue - try: - tag = get_or_create_tag(tag_name) - link_tag_to_video(video, tag) - except Exception: - logger.warning( - "_link_tags: could not link tag %r to video %s", - tag_name, - video_id, - ) + tag = get_or_create_tag(tag_name) + link_tag_to_video(video, tag) def _resolve_video_speakers(video_data: dict, channel_title: str) -> list[str]: diff --git a/cppa_youtube_script_tracker/preprocessor.py b/cppa_youtube_script_tracker/preprocessor.py index 1cc5c5b..36e8454 100644 --- a/cppa_youtube_script_tracker/preprocessor.py +++ b/cppa_youtube_script_tracker/preprocessor.py @@ -109,7 +109,7 @@ def _build_candidate_queryset( return queryset.order_by("video_id") criteria = Q() if final_sync_at is not None: - criteria |= Q(created_at__gt=final_sync_at) + criteria |= Q(updated_at__gt=final_sync_at) if normalized_failed: criteria |= Q(video_id__in=normalized_failed) return queryset.filter(criteria).order_by("video_id") diff --git a/cppa_youtube_script_tracker/services.py b/cppa_youtube_script_tracker/services.py index 51fcb6e..7ec7877 100644 --- a/cppa_youtube_script_tracker/services.py +++ b/cppa_youtube_script_tracker/services.py @@ -40,6 +40,8 @@ def get_or_create_channel( Returns the YouTubeChannel instance. 
""" channel_id_val = (channel_id or "").strip() + if not channel_id_val: + raise ValueError("channel_id must not be empty.") channel_title_val = (channel_title or "").strip() channel, created = YouTubeChannel.objects.get_or_create( channel_id=channel_id_val, diff --git a/docs/service_api/cppa_youtube_script_tracker.md b/docs/service_api/cppa_youtube_script_tracker.md index 337af77..a429912 100644 --- a/docs/service_api/cppa_youtube_script_tracker.md +++ b/docs/service_api/cppa_youtube_script_tracker.md @@ -33,10 +33,11 @@ | `view_count` | int \| None | | | `like_count` | int \| None | | | `comment_count` | int \| None | | -| `tags` | list | | | `search_term` | str | Search term used to discover the video | | `scraped_at` | datetime or str | ISO string is parsed via `parse_datetime` | +Tags are not part of `metadata_dict`; use `get_or_create_tag` and `link_tag_to_video` (in this module) to associate tags with a video after creating or fetching it. + --- ## YouTubeVideoSpeaker From 5f445ee620bf1a2ae89372837bede40a47e802df Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 02:42:42 +0800 Subject: [PATCH 22/76] #38-addressed the coderabbitai's suggestions --- .../0005_youtubespeaker_alter_baseprofile_type.py | 4 ++++ .../migrations/0007_youtubespeaker_external_id.py | 12 ++++++++---- cppa_youtube_script_tracker/utils.py | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py index 0286793..7d2018e 100644 --- a/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py +++ b/cppa_user_tracker/migrations/0005_youtubespeaker_alter_baseprofile_type.py @@ -23,6 +23,10 @@ class Migration(migrations.Migration): to="cppa_user_tracker.baseprofile", ), ), + ( + "external_id", + models.CharField(blank=True, max_length=255, null=True), + ), ("display_name", models.CharField(db_index=True, 
max_length=255)), ("created_at", models.DateTimeField(auto_now_add=True)), ("updated_at", models.DateTimeField(auto_now=True)), diff --git a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py index 4cd1607..199289e 100644 --- a/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py +++ b/cppa_user_tracker/migrations/0007_youtubespeaker_external_id.py @@ -49,10 +49,14 @@ class Migration(migrations.Migration): ), reverse_sql=migrations.RunSQL.noop, ), - migrations.AddField( - model_name="youtubespeaker", - name="external_id", - field=models.CharField(blank=True, max_length=255, null=True), + # Add column if missing (no-op when 0005 already created it; required when + # upgrading from pre-fix 0005 that did not include external_id). + migrations.RunSQL( + sql=( + "ALTER TABLE cppa_user_tracker_youtubespeaker " + "ADD COLUMN IF NOT EXISTS external_id VARCHAR(255) NULL;" + ), + reverse_sql=migrations.RunSQL.noop, ), migrations.RunPython(populate_external_id, migrations.RunPython.noop), migrations.AlterField( diff --git a/cppa_youtube_script_tracker/utils.py b/cppa_youtube_script_tracker/utils.py index ee669cf..0d6c28e 100644 --- a/cppa_youtube_script_tracker/utils.py +++ b/cppa_youtube_script_tracker/utils.py @@ -5,7 +5,7 @@ 1) description patterns 2) title pattern 3) transcript introduction patterns -4) fallback to "unkown" +4) fallback to "unknown" """ from __future__ import annotations @@ -13,7 +13,7 @@ import re from typing import Iterable -UNKNOWN_SPEAKER_NAME = "unkown" +UNKNOWN_SPEAKER_NAME = "unknown" _SEPARATORS = (" - ", " — ", " | ") _INTRO_RE = re.compile( @@ -182,7 +182,7 @@ def resolve_speakers( ) -> list[str]: """ Resolve speakers using priority: - description -> title -> transcript -> ["unkown"] + description -> title -> transcript -> ["unknown"] """ from_description = extract_speakers_from_description( description=description, title=title, channel_title=channel_title 
From bf571ec52f8d1eba010c11296afc55f350ca5877 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 02:59:21 +0800 Subject: [PATCH 23/76] #38-fixed a query issue --- .../commands/run_cppa_youtube_script_tracker.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py index 4e96089..7e3cf4d 100644 --- a/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py +++ b/cppa_youtube_script_tracker/management/commands/run_cppa_youtube_script_tracker.py @@ -350,15 +350,12 @@ def _run_phase_3() -> tuple[int, int]: Saves directly to raw/transcripts/ (never deleted). Returns (ok_count, fail_count). """ - pending = list( - YouTubeVideo.objects.filter(has_transcript=False).values_list( - "video_id", flat=True - ) - ) + pending = list(YouTubeVideo.objects.filter(has_transcript=False)) transcripts_dir = get_raw_transcripts_dir() ok = 0 fail = 0 - for vid in pending: + for video_obj in pending: + vid = video_obj.video_id try: vtt_path = download_vtt( vid, output_dir=transcripts_dir, cookies_file=YOUTUBE_COOKIES_FILE From 36eb704643f8838840225905d6ba32af5420895e Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 14 Mar 2026 03:03:28 +0800 Subject: [PATCH 24/76] #38-fix timezone issue --- cppa_youtube_script_tracker/fetcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cppa_youtube_script_tracker/fetcher.py b/cppa_youtube_script_tracker/fetcher.py index a55c0df..e7872a7 100644 --- a/cppa_youtube_script_tracker/fetcher.py +++ b/cppa_youtube_script_tracker/fetcher.py @@ -144,6 +144,8 @@ def _to_rfc3339(dt: datetime) -> str: """Format a datetime as RFC 3339 (required by YouTube API publishedAfter/Before).""" if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) return dt.strftime("%Y-%m-%dT%H:%M:%SZ") From 
3e32ee28977eb2780ef898554481a12e9ca72dcd Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 21 Mar 2026 04:43:12 -0700 Subject: [PATCH 25/76] refactor(wg21): pipeline dispatch + mailing range; remove Cloud Run stack #113 --- config/settings.py | 13 +- docs/operations/WG21_Cloud_Run.md | 65 -- docs/operations/WG21_GitHub_Dispatch.md | 69 ++ requirements.txt | 2 - wg21_paper_tracker/cloud_run_job/Dockerfile | 28 - .../cloud_run_job/converters/__init__.py | 9 - .../converters/docling_converter.py | 61 -- .../converters/openai_converter.py | 338 ---------- .../converters/pdfplumber_converter.py | 102 --- wg21_paper_tracker/cloud_run_job/main.py | 118 ---- .../cloud_run_job/requirements.txt | 6 - wg21_paper_tracker/fetcher.py | 143 +++-- .../commands/import_wg21_metadata_from_csv.py | 237 ++++--- .../commands/run_wg21_paper_tracker.py | 189 ++++-- wg21_paper_tracker/pipeline.py | 591 +++++++++--------- wg21_paper_tracker/tests/test_commands.py | 120 ++++ wg21_paper_tracker/tests/test_fetcher.py | 68 ++ wg21_paper_tracker/tests/test_pipeline.py | 282 +++++---- 18 files changed, 1084 insertions(+), 1357 deletions(-) delete mode 100644 docs/operations/WG21_Cloud_Run.md create mode 100644 docs/operations/WG21_GitHub_Dispatch.md delete mode 100644 wg21_paper_tracker/cloud_run_job/Dockerfile delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/__init__.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/docling_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/openai_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/main.py delete mode 100644 wg21_paper_tracker/cloud_run_job/requirements.txt diff --git a/config/settings.py b/config/settings.py index 17f549a..b92f137 100644 --- a/config/settings.py +++ b/config/settings.py @@ -217,11 +217,14 @@ ).resolve() # WG21 Paper Tracker Configuration -WG21_GCS_BUCKET = 
(env("WG21_GCS_BUCKET", default="") or "").strip() -GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() -GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="") or "").strip() -WG21_CLOUD_RUN_ENABLED = env.bool("WG21_CLOUD_RUN_ENABLED", default=False) +WG21_GITHUB_DISPATCH_ENABLED = env.bool("WG21_GITHUB_DISPATCH_ENABLED", default=False) +WG21_GITHUB_DISPATCH_REPO = (env("WG21_GITHUB_DISPATCH_REPO", default="") or "").strip() +WG21_GITHUB_DISPATCH_TOKEN = ( + env("WG21_GITHUB_DISPATCH_TOKEN", default="") or "" +).strip() +WG21_GITHUB_DISPATCH_EVENT_TYPE = ( + env("WG21_GITHUB_DISPATCH_EVENT_TYPE", default="wg21_papers_convert") or "" +).strip() or "wg21_papers_convert" # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md deleted file mode 100644 index b1caccf..0000000 --- a/docs/operations/WG21_Cloud_Run.md +++ /dev/null @@ -1,65 +0,0 @@ -# WG21 Paper Conversion Cloud Run Job - -The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is separated from the main Django project and runs as a Google Cloud Run Job. - -When `WG21_CLOUD_RUN_ENABLED=true` and `WG21_CLOUD_RUN_JOB_NAME` is set, the Django tracker (`run_wg21_paper_tracker`) triggers the configured Cloud Run job after uploading new papers. - -## 1. Setup Google Cloud Storage - -Create a GCS bucket (e.g., `wg21-data-collector`). - -Ensure your Django app has the following environment variables configured: - -- `WG21_GCS_BUCKET`: The name of the GCS bucket. -- `GCP_PROJECT_ID`: Your GCP project ID. -- `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). 
No default; leave unset if you only use GCS uploads without triggering the job. -- `WG21_CLOUD_RUN_ENABLED`: (Optional, default `false`) Set to `true` to allow the tracker to trigger the Cloud Run conversion job when new papers are uploaded. Keeps the trigger optional even when project and bucket are set. -- `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job. - -## 2. Build and Push the Docker Image - -Navigate to the Cloud Run job directory: - -```bash -cd wg21_paper_tracker/cloud_run_job/ -``` - -Build the Docker image. Replace `[PROJECT_ID]` with your GCP Project ID: - -```bash -docker build -t gcr.io/[PROJECT_ID]/wg21-convert . -``` - -Push the image to Google Container Registry (or Artifact Registry): - -```bash -docker push gcr.io/[PROJECT_ID]/wg21-convert -``` - -## 3. Create the Cloud Run Job - -Create the job in Google Cloud. We recommend allocating sufficient memory and CPU since Docling and PDFPlumber are resource-intensive. - -```bash -gcloud run jobs create wg21-convert \ - --image gcr.io/[PROJECT_ID]/wg21-convert \ - --memory 8Gi \ - --cpu 4 \ - --region us-central1 \ - --set-env-vars WG21_GCS_BUCKET=wg21-data-collector -``` - -Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manager](https://cloud.google.com/run/docs/configuring/secrets)) rather than inline in `--set-env-vars`, to avoid leaking the key into shell history, CI logs, or audit trails. - -## 4. Service Account & IAM Permissions - -1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API. -2. **GCS Access:** Both the Django application and the Cloud Run job require read/write access to the GCS bucket (`roles/storage.objectAdmin`). - -## 5. Flow Summary - -1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs. -2. It checks the WG21 site for new mailings. 
-3. If found, it downloads PDFs and uploads them directly to `gs:///raw/wg21_paper_tracker///`. -4. If Cloud Run triggering is enabled, it calls the configured Cloud Run job. -5. The Cloud Run Job then spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs:///converted/wg21_papers//`. diff --git a/docs/operations/WG21_GitHub_Dispatch.md b/docs/operations/WG21_GitHub_Dispatch.md new file mode 100644 index 0000000..49046b0 --- /dev/null +++ b/docs/operations/WG21_GitHub_Dispatch.md @@ -0,0 +1,69 @@ +# WG21 Paper Tracker → GitHub Actions (`repository_dispatch`) + +The Django app **`run_wg21_paper_tracker`** scrapes WG21 mailings and stores paper metadata in the database. It does **not** download PDFs or other documents. When **new** paper rows are created in a run, it can send **one** [repository dispatch](https://docs.github.com/en/rest/repos/repos#create-a-repository-dispatch-event) to another GitHub repository so a workflow there fetches each URL and runs conversion (e.g. PDF → Markdown). + +## Environment variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `WG21_GITHUB_DISPATCH_ENABLED` | No (default `false`) | Set to `true` to send `repository_dispatch` when there are new papers. | +| `WG21_GITHUB_DISPATCH_REPO` | Yes, if enabled | Target repo as `owner/repo` (the repo whose workflow will run). | +| `WG21_GITHUB_DISPATCH_TOKEN` | Yes, if enabled | PAT or token with permission to create repository dispatch events on that repo (classic PAT: `repo` scope for private repos). | +| `WG21_GITHUB_DISPATCH_EVENT_TYPE` | No | Must match `on.repository_dispatch.types` in the target workflow. Default: `wg21_papers_convert`. 
| + +## `client_payload` contract + +The JSON body includes only a list of URL strings: + +```json +{ + "papers": [ + "https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/…", + "https://www.open-std.org/…" + ] +} +``` + +- **`papers`**: array of strings (WG21 document URLs), all new papers from **that** pipeline run in a **single** event. +- There is **no** `new_paper_count` field; use `length(papers)` in the workflow if needed. + +## Target repository workflow (example) + +```yaml +on: + repository_dispatch: + types: [wg21_papers_convert] + +jobs: + convert: + runs-on: ubuntu-latest + steps: + - name: URLs + run: | + echo '${{ toJson(github.event.client_payload.papers) }}' + # Fetch each URL, convert, store artifacts / upload elsewhere +``` + +In expressions, `github.event.client_payload.papers` is a JSON array of strings. + +## Token security + +Store `WG21_GITHUB_DISPATCH_TOKEN` in a secret manager or CI secret—never commit it. Prefer a fine-grained PAT scoped to the conversion repo if possible. + +## Payload size + +Very large mailings could produce many URLs in one payload. If you approach GitHub or runner limits, document a split strategy (multiple dispatches) as an edge case; the default is one dispatch per tracker run with the full list. + +## CLI options + +- **`--from-date YYYY-MM`**: Process mailings with `mailing_date >= YYYY-MM` (WG21 / CSV style). Backfills from that key onward when used alone. +- **`--to-date YYYY-MM`**: Upper bound: `mailing_date <= YYYY-MM`. With `--from-date`, the run uses the inclusive range `[from, to]`. Without `--from-date`, behavior stays incremental (only mailings **newer than** the latest `WG21Mailing` in the DB), but capped at `to`—useful to avoid pulling very new mailings in a controlled run. +- **`--dry-run`**: Log only; do not run the pipeline or send dispatch. + +## Flow summary + +1. Scheduler runs `run_wg21_paper_tracker` (optionally with `--from-date` / `--to-date`). +2. 
Pipeline fetches mailings, upserts `WG21Mailing` / `WG21Paper` (metadata only). +3. For each row **newly created** in that run, its document URL is collected. +4. If the list is non-empty and dispatch is enabled, the app POSTs once to `POST /repos/{owner}/{repo}/dispatches` with `event_type` and `client_payload: { "papers": [ ... ] }`. +5. The conversion repo’s workflow runs and downloads each URL. diff --git a/requirements.txt b/requirements.txt index a94ab8d..80db52f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,5 +15,3 @@ selenium>=4.35 # wg21_paper_tracker app beautifulsoup4>=4.12.0 -google-cloud-run>=0.10.1 -google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile deleted file mode 100644 index d52244b..0000000 --- a/wg21_paper_tracker/cloud_run_job/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Use an official Python runtime as a parent image -FROM python:3.11-slim - -# Set working directory -WORKDIR /app - -# Install system dependencies required by converters (e.g. Poppler for PDF image extraction) -RUN apt-get update && apt-get install -y --no-install-recommends \ - poppler-utils \ - libgl1-mesa-glx \ - libglib2.0-0 \ - && rm -rf /var/lib/apt/lists/* - -RUN groupadd -r app && useradd -r -g app app - -# Copy requirements -COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application files -COPY --chown=app:app . . - -USER app - -# Run the main script -CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/__init__.py b/wg21_paper_tracker/cloud_run_job/converters/__init__.py deleted file mode 100644 index 515d30a..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -PDF to Markdown converters module. 
-""" - -from .docling_converter import convert_with_docling -from .pdfplumber_converter import convert_with_pdfplumber -from .openai_converter import convert_with_openai - -__all__ = ["convert_with_docling", "convert_with_pdfplumber", "convert_with_openai"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py deleted file mode 100644 index 7e73753..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Docling-based PDF to Markdown converter. -""" - -from pathlib import Path -from typing import Optional -import logging - -logger = logging.getLogger(__name__) - -try: - from docling.document_converter import DocumentConverter # type: ignore[import-untyped] - from docling.datamodel.base_models import InputFormat # type: ignore[import-untyped] - - DOCLING_AVAILABLE = True -except ImportError: - DocumentConverter = None # type: ignore[assignment,misc] - InputFormat = None # type: ignore[assignment,misc] - DOCLING_AVAILABLE = False - logger.warning("Docling not available. Install with: pip install docling") - - -def convert_with_docling(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using Docling. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. 
- """ - if not DOCLING_AVAILABLE or DocumentConverter is None: - logger.error("Docling is not available") - return None - - try: - logger.info(f"Attempting Docling conversion for: {pdf_path.name}") - - # Initialize converter - converter = DocumentConverter() - - # Convert PDF to document - result = converter.convert(pdf_path) - - # Extract markdown - markdown_content = result.document.export_to_markdown() - - if markdown_content and len(markdown_content.strip()) > 0: - logger.info(f"Docling conversion successful for: {pdf_path.name}") - logger.info(f"Extracted {len(markdown_content)} characters") - return markdown_content - else: - logger.warning( - f"Docling conversion returned empty content for: {pdf_path.name}" - ) - return None - - except Exception: - logger.error(f"Docling conversion failed for {pdf_path.name}", exc_info=True) - return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py deleted file mode 100644 index 66c08b0..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -OpenAI/OpenRouter-based PDF to Markdown converter with OCR. -""" - -from __future__ import annotations - -import base64 -import io -import logging -import os -import shutil -import tempfile -import time -from pathlib import Path -from typing import Optional - -import requests - -logger = logging.getLogger(__name__) - -# Base configuration fallback - - -OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") -OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" -OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") - -try: - from pdf2image import convert_from_path - from PIL import Image, ImageOps - - PDF2IMAGE_AVAILABLE = True -except ImportError: - PDF2IMAGE_AVAILABLE = False - logger.warning( - "pdf2image/PIL not available. 
Install with: pip install pdf2image pillow" - ) - - -def pdf_to_images(pdf_path: Path) -> tuple[Optional[Path], list[Path]]: - """ - Convert PDF pages to image files on disk (one per page) to avoid loading all into memory. - - Writes images into a temporary directory and returns (tmp_dir, paths). Caller must process - each path and then remove tmp_dir (e.g. shutil.rmtree) so only the current page is resident. - - Note: pdf2image should automatically handle PDF rotation metadata; we also apply - additional rotation correction in correct_image_rotation() when loading each image. - - Args: - pdf_path: Path to the PDF file. - - Returns: - (tmp_dir, list of image paths). tmp_dir is None on failure or if pdf2image unavailable; - paths are in page order. Caller must cleanup tmp_dir when not None. - """ - if not PDF2IMAGE_AVAILABLE: - logger.error("pdf2image is not available") - return (None, []) - - try: - logger.info(f"Converting PDF to images: {pdf_path.name}") - tmp_dir = Path(tempfile.mkdtemp(prefix="wg21_pdf_")) - try: - path_strs = convert_from_path( - pdf_path, - dpi=200, - paths_only=True, - output_folder=str(tmp_dir), - ) - paths = [Path(p) for p in path_strs] - logger.info(f"Converted {len(paths)} pages to images") - return (tmp_dir, paths) - except Exception: - shutil.rmtree(tmp_dir, ignore_errors=True) - raise - except Exception as e: - logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) - return (None, []) - - -def correct_image_rotation(image: Image.Image) -> Image.Image: - """ - Correct image rotation using EXIF data and heuristics. - - Args: - image: PIL Image object. - - Returns: - Corrected PIL Image object. 
- """ - try: - # First, try to correct using EXIF orientation data - # This handles images that have rotation metadata - corrected_image = ImageOps.exif_transpose(image) - - # If the image was rotated, log it - if corrected_image != image: - logger.debug("Image rotation corrected using EXIF data") - return corrected_image - - # If no EXIF data, check if image might be rotated - # For PDF pages, we can check if width > height suggests landscape - # But we'll keep the original orientation as PDFs can be in any orientation - # The OpenAI vision model can handle rotated text, but it's better to correct it - - return corrected_image - - except Exception as e: - logger.warning(f"Error correcting image rotation: {str(e)}") - return image - - -def image_to_base64(image: Image.Image) -> str: - """ - Convert PIL Image to base64 string. - Automatically corrects rotation before encoding. - - Args: - image: PIL Image object. - - Returns: - Base64 encoded string. - """ - # Correct rotation before encoding - corrected_image = correct_image_rotation(image) - - buffered = io.BytesIO() - corrected_image.save(buffered, format="PNG") - img_str = base64.b64encode(buffered.getvalue()).decode() - return img_str - - -def convert_page_with_openai( - image_base64: str, page_num: int, total_pages: int -) -> Optional[str]: - """ - Convert a single page image to markdown using OpenAI/OpenRouter. - - Args: - image_base64: Base64 encoded image string. - page_num: Current page number. - total_pages: Total number of pages. - - Returns: - Markdown content for the page, or None if conversion fails. - """ - if not OPENROUTER_API_KEY: - logger.error("OpenRouter API key is not set") - return None - - url = f"{OPENROUTER_BASE_URL}/chat/completions" - headers = { - "Authorization": f"Bearer {OPENROUTER_API_KEY}", - "Content-Type": "application/json", - } - - payload = { - "model": OPENROUTER_MODEL, - "messages": [ - { - "role": "system", - "content": "You are a document conversion assistant. 
Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. If the page appears rotated, read and convert the text in its correct orientation.", - }, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - }, - ], - }, - ], - "max_tokens": 4000, - } - - max_attempts = 3 # initial + 2 retries - retry_delays = [1, 2] # exponential backoff in seconds - - for attempt in range(max_attempts): - try: - logger.info( - f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter" - + (f" (attempt {attempt + 1}/{max_attempts})" if attempt > 0 else "") - ) - - response = requests.post(url, json=payload, headers=headers, timeout=120) - response.raise_for_status() - - result = response.json() - markdown_content = result["choices"][0]["message"]["content"] - - logger.info( - f"Successfully converted page {page_num} with OpenAI/OpenRouter" - ) - return markdown_content - - except ( - requests.exceptions.Timeout, - requests.exceptions.ConnectionError, - ) as e: - retryable = attempt < max_attempts - 1 - if retryable: - delay = retry_delays[attempt] - logger.warning( - f"Transient error on page {page_num} ({type(e).__name__}), " - f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" - ) - time.sleep(delay) - else: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None - - except requests.exceptions.HTTPError as e: - status_code = e.response.status_code if e.response is not None else None - retryable = ( - attempt 
< max_attempts - 1 - and status_code is not None - and (status_code == 429 or 500 <= status_code < 600) - ) - if retryable: - delay = retry_delays[attempt] - logger.warning( - f"HTTP {status_code} on page {page_num}, " - f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" - ) - time.sleep(delay) - else: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None - - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None - - return None - - -def convert_with_openai(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using OpenAI/OpenRouter with OCR. - Processes each page as an image. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. - """ - if not OPENROUTER_API_KEY: - logger.error("OpenRouter API key is not set in environment variables") - return None - - if not PDF2IMAGE_AVAILABLE: - logger.error("pdf2image is required for OpenAI conversion") - return None - - try: - logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - - try: - # Convert PDF to image files on disk (avoids loading all pages into memory) - tmp_dir, paths = pdf_to_images(pdf_path) - if not paths: - logger.error(f"Failed to convert PDF to images: {pdf_path.name}") - return None - - total_pages = len(paths) - markdown_parts = [] - successful_pages = 0 - # Process each page: load one image at a time, convert, then move on - for page_num, image_path in enumerate(paths, 1): - try: - with Image.open(image_path) as img: - img.load() - image_base64 = image_to_base64(img) - # Convert page with OpenAI - page_markdown = convert_page_with_openai( - image_base64, page_num, total_pages - ) - - if page_markdown: - markdown_parts.append(page_markdown) - markdown_parts.append("\n\n") - successful_pages += 1 - else: - logger.warning(f"Failed to convert 
page {page_num} with OpenAI") - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - - except Exception as e: - logger.error( - f"Error processing page {page_num}: {str(e)}", - exc_info=True, - ) - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - continue - finally: - if tmp_dir is not None: - shutil.rmtree(tmp_dir, ignore_errors=True) - - markdown_content = "".join(markdown_parts) - - if successful_pages > 0 and markdown_content.strip(): - logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") - logger.info( - f"Extracted {len(markdown_content)} characters from {total_pages} pages" - ) - return markdown_content - logger.warning( - "OpenAI/OpenRouter conversion produced no usable pages for: %s", - pdf_path.name, - ) - return None - - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for {pdf_path.name}: {str(e)}", - exc_info=True, - ) - return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py deleted file mode 100644 index fb36c4e..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -PDFPlumber-based PDF to Markdown converter. -""" - -from pathlib import Path -from typing import Optional -import logging - -logger = logging.getLogger(__name__) - -try: - import pdfplumber - - PDFPLUMBER_AVAILABLE = True -except ImportError: - PDFPLUMBER_AVAILABLE = False - logger.warning("PDFPlumber not available. Install with: pip install pdfplumber") - - -def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using PDFPlumber. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. 
- """ - if not PDFPLUMBER_AVAILABLE: - logger.error("PDFPlumber is not available") - return None - - try: - logger.info(f"Attempting PDFPlumber conversion for: {pdf_path.name}") - - markdown_parts = [] - - with pdfplumber.open(pdf_path) as pdf: - total_pages = len(pdf.pages) - logger.info(f"Processing {total_pages} pages with PDFPlumber") - - for page_num, page in enumerate(pdf.pages, 1): - try: - # Extract text from page - text = page.extract_text() - - if text: - markdown_parts.append(text.replace("\n", " \n")) - markdown_parts.append("\n\n") - - # Extract tables if any - tables = page.extract_tables() - if tables: - for table in tables: - if table: - markdown_parts.append("\n### Table\n\n") - first_row = True - # Convert table to markdown format - for row in table: - if row: - markdown_parts.append( - "| " - + " | ".join( - "" if cell is None else str(cell) - for cell in row - ) - + " |\n" - ) - if first_row: - markdown_parts.append( - "| " - + " | ".join("---" for _ in row) - + " |\n" - ) - first_row = False - markdown_parts.append("\n") - - except Exception as e: - logger.warning( - f"Error processing page {page_num} of {pdf_path.name}: {str(e)}" - ) - continue - - markdown_content = "".join(markdown_parts) - - if markdown_content and len(markdown_content.strip()) > 0: - logger.info(f"PDFPlumber conversion successful for: {pdf_path.name}") - logger.info(f"Extracted {len(markdown_content)} characters") - return markdown_content - else: - logger.warning( - f"PDFPlumber conversion returned empty content for: {pdf_path.name}" - ) - return None - - except Exception as e: - logger.error( - f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", - exc_info=True, - ) - return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py deleted file mode 100644 index 61c57dc..0000000 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import logging -from pathlib import Path -import 
tempfile -from typing import Optional - -from google.cloud import storage - -from converters.docling_converter import convert_with_docling -from converters.pdfplumber_converter import convert_with_pdfplumber -from converters.openai_converter import convert_with_openai - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -MIN_CONTENT_LENGTH = 50 - - -def is_content_valid(content: Optional[str]) -> bool: - if not content: - return False - content_stripped = content.strip() - if len(content_stripped) < MIN_CONTENT_LENGTH: - return False - error_patterns = [ - "traceback", - "exception:", - "error:", - "failed to", - "unable to convert", - "conversion failed", - "error processing", - ] - content_lower = content_stripped.lower() - first_part = content_lower[:1000] - for pattern in error_patterns: - if pattern in first_part: - if pattern in ("error:", "exception:"): - return False - idx = first_part.find(pattern) - if idx < 100: - return False - return True - - -def convert_pdf_to_md(pdf_path: Path) -> str: - logger.info("Attempting Docling conversion...") - content = convert_with_docling(pdf_path) - if is_content_valid(content): - return content - - logger.info("Attempting PDFPlumber conversion...") - content = convert_with_pdfplumber(pdf_path) - if is_content_valid(content): - return content - - logger.info("Attempting OpenAI conversion...") - content = convert_with_openai(pdf_path) - if is_content_valid(content): - return content - - return "" - - -def main(): - bucket_name = os.getenv("WG21_GCS_BUCKET") - if not bucket_name: - logger.error("WG21_GCS_BUCKET env var not set.") - raise RuntimeError("WG21_GCS_BUCKET env var not set.") - - client = storage.Client() - bucket = client.bucket(bucket_name) - - raw_prefix = "raw/wg21_paper_tracker/" - converted_prefix = "converted/wg21_papers/" - - blobs = client.list_blobs(bucket, prefix=raw_prefix) - - with 
tempfile.TemporaryDirectory() as tmpdir: - for blob in blobs: - if not blob.name.lower().endswith(".pdf"): - continue - - local_pdf_path = Path(tmpdir) / "temp.pdf" - try: - # e.g. raw/wg21_paper_tracker/2025/2025-02/p0149r1.pdf -> 2025/2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix) :] - md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" - md_blob_name = f"{converted_prefix}{md_relative_path}" - - md_blob = bucket.blob(md_blob_name) - if md_blob.exists(): - logger.info("Skipping %s, MD already exists.", blob.name) - continue - - logger.info("Downloading %s to process...", blob.name) - blob.download_to_filename(str(local_pdf_path)) - - logger.info("Converting %s...", blob.name) - md_content = convert_pdf_to_md(local_pdf_path) - - if md_content: - md_blob.upload_from_string(md_content, content_type="text/markdown") - logger.info("Successfully converted and uploaded %s", md_blob_name) - else: - logger.error("Failed to convert %s", blob.name) - except Exception: - logger.exception("Failed processing %s", blob.name) - finally: - if local_pdf_path.exists(): - local_pdf_path.unlink() - - -if __name__ == "__main__": - main() diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt deleted file mode 100644 index 82422b1..0000000 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -docling>=1.0.0 -pdfplumber>=0.10.0 -pdf2image>=1.16.0 -Pillow>=12.1.1 -requests>=2.31.0 -google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index 2c6ad03..05f6e98 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -18,6 +18,86 @@ BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" _MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$") +# Paper link in first column: e.g. 
p1234r0.pdf, n4920.html, sd-9.md +_PAPER_LINK_PATTERN = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) + + +def extract_paper_metadata_from_table_row( + cells: list[Tag], + page_url: str, +) -> Optional[dict]: + """ + Extract paper metadata from a WG21 mailing table row (td/th cells). + + Current year pages (e.g. 2026) use eight columns:: + + WG21 Number | Title | Author | Document Date | Mailing Date | + Previous Version | Subgroup | Disposition + + So **subgroup is index 6**, not 4. Index 4 is *mailing date* (string as shown on the site). + + Older pages used a shorter row (five data columns); then subgroup was at index 4. + If ``len(cells) >= 8`` we use the 8-column layout; otherwise we keep the legacy mapping. + """ + if not cells: + return None + + first_cell = cells[0] + base = urllib.parse.urlparse(BASE_URL) + + title = "" + if len(cells) > 1: + title = cells[1].text.strip() + + authors: list[str] = [] + if len(cells) > 2: + authors_raw = cells[2].text.strip() + if authors_raw: + authors = [ + a.strip() for a in re.split(r",| and ", authors_raw) if a.strip() + ] + + document_date = None + if len(cells) > 3: + date_str = cells[3].text.strip() + if date_str: + document_date = date_str + + # 8+ columns: mailing date [4], previous version [5], subgroup [6], disposition [7] + subgroup = "" + if len(cells) >= 8: + subgroup = cells[6].text.strip() + elif len(cells) > 4: + subgroup = cells[4].text.strip() + + for link in first_cell.find_all("a", href=True): + href = link.get("href", "") + match = _PAPER_LINK_PATTERN.search(href) + if not match: + continue + + paper_url = urllib.parse.urljoin(page_url, href) + parsed = urllib.parse.urlparse(paper_url) + if parsed.scheme not in ("https", "http") or parsed.netloc != base.netloc: + logger.warning("Skipping off-origin paper URL %s", paper_url) + continue + + paper_id = match.group(1).lower() + file_ext = match.group(2).lower() + filename = match.group(0).lower() + + return { + "url": paper_url, + 
"filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + } + + return None def _find_table_in_section(anchor) -> Optional[Tag]: @@ -112,72 +192,15 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: return [] paper_urls = [] - paper_pattern = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) for row in table.find_all("tr"): cells = row.find_all(["td", "th"]) if not cells or any(cell.get("colspan") for cell in cells): continue - # Usually: Number, Title, Author, Date, Subgroup - if len(cells) >= 1: - first_cell = cells[0] - for link in first_cell.find_all("a", href=True): - href = link.get("href", "") - match = paper_pattern.search(href) - if match: - paper_url = urllib.parse.urljoin(url, href) - parsed = urllib.parse.urlparse(paper_url) - base = urllib.parse.urlparse(BASE_URL) - if ( - parsed.scheme not in ("https", "http") - or parsed.netloc != base.netloc - ): - logger.warning("Skipping off-origin paper URL %s", paper_url) - continue - - paper_id = match.group(1).lower() - file_ext = match.group(2).lower() - filename = match.group(0).lower() - - title = "" - if len(cells) > 1: - title = cells[1].text.strip() - - authors = [] - if len(cells) > 2: - authors_raw = cells[2].text.strip() - # Split by comma or 'and' if multiple - if authors_raw: - authors = [ - a.strip() - for a in re.split(r",| and ", authors_raw) - if a.strip() - ] - - document_date = None - if len(cells) > 3: - date_str = cells[3].text.strip() - if date_str: - document_date = date_str # Will be parsed/saved in pipeline - - subgroup = "" - if len(cells) > 4: - subgroup = cells[4].text.strip() - - paper_urls.append( - { - "url": paper_url, - "filename": filename, - "type": file_ext, - "paper_id": paper_id, - "title": title, - "authors": authors, - "document_date": document_date, - "subgroup": subgroup, - } - ) - break # Only take the first paper link in 
the cell + paper = extract_paper_metadata_from_table_row(cells, url) + if paper: + paper_urls.append(paper) # Remove exact duplicates (same filename) seen = set() diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index feff138..824617a 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -7,10 +7,15 @@ (unknown / Unknown). """ +from __future__ import annotations + import csv import logging import re +from dataclasses import dataclass +from datetime import date from pathlib import Path +from typing import Optional from django.core.management.base import BaseCommand, CommandError from django.db import IntegrityError @@ -93,6 +98,131 @@ def _read_csv_rows(csv_path: Path): yield out +@dataclass(frozen=True) +class _CsvImportRow: + paper_id: str + url: str + mailing_date: str + mailing_title: str + document_date: Optional[date] + year: Optional[int] + title: str + subgroup: str + author_names: list[str] + + +def _parse_csv_import_row(row: dict) -> _CsvImportRow | None: + """Return parsed row, or None when paper_id or url is missing.""" + paper_id = (row.get("paper_id", "") or "").strip().lower() + url = row.get("url", "") + if not paper_id or not url: + return None + + mailing_date, mailing_title = _resolve_mailing_date(row.get("mailing_date", "")) + document_date = _parse_document_date(row.get("date", "")) + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): + year = int(mailing_date[:4]) + elif document_date is not None: + year = document_date.year + else: + year = None + title = row.get("title", "") or paper_id + subgroup = row.get("subgroup", "") + author_names = _author_names_from_csv(row.get("author", "")) + return _CsvImportRow( + paper_id=paper_id, + url=url, + mailing_date=mailing_date, + mailing_title=mailing_title, + 
document_date=document_date, + year=year, + title=title, + subgroup=subgroup, + author_names=author_names, + ) + + +def _log_dry_run_row(parsed: _CsvImportRow) -> None: + logger.info( + "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", + parsed.paper_id, + parsed.mailing_date, + parsed.document_date, + len(parsed.author_names), + ) + + +def _attach_csv_authors_to_paper(paper: WG21Paper, author_names: list[str]) -> None: + from cppa_user_tracker.services import ( + get_or_create_wg21_paper_author_profile, + ) + + for i, name in enumerate(author_names): + profile, _ = get_or_create_wg21_paper_author_profile(name) + get_or_create_paper_author(paper, profile, i + 1) + + +def _update_paper_on_integrity_error( + parsed: _CsvImportRow, exc: IntegrityError, stats: dict +) -> None: + mailing, _ = get_or_create_mailing(parsed.mailing_date, parsed.mailing_title) + try: + lookup_year = parsed.year if parsed.year is not None else 0 + paper = WG21Paper.objects.filter( + paper_id=parsed.paper_id, year=lookup_year + ).first() + if paper is None: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, exc) + return + paper.url = parsed.url + paper.title = parsed.title + paper.document_date = parsed.document_date + paper.mailing = mailing + paper.subgroup = parsed.subgroup + if parsed.year is not None: + paper.year = parsed.year + paper.save() + stats["papers_updated"] += 1 + if parsed.author_names: + _attach_csv_authors_to_paper(paper, parsed.author_names) + except Exception: + stats["skipped"] += 1 + logger.exception( + "Error for paper_id=%s (after IntegrityError).", + parsed.paper_id, + ) + + +def _upsert_paper_from_csv_row(parsed: _CsvImportRow, stats: dict) -> None: + try: + mailing, mailing_created = get_or_create_mailing( + parsed.mailing_date, parsed.mailing_title + ) + if mailing_created: + stats["mailings_created"] += 1 + + _paper, paper_created = get_or_create_paper( + paper_id=parsed.paper_id, + url=parsed.url, + 
title=parsed.title, + document_date=parsed.document_date, + mailing=mailing, + subgroup=parsed.subgroup, + author_names=parsed.author_names if parsed.author_names else None, + year=parsed.year, + ) + if paper_created: + stats["papers_created"] += 1 + else: + stats["papers_updated"] += 1 + except IntegrityError as e: + _update_paper_on_integrity_error(parsed, e, stats) + except Exception as e: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, e) + + class Command(BaseCommand): help = ( "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). " @@ -133,34 +263,11 @@ def handle(self, *args, **options): for row in _read_csv_rows(csv_path): stats["rows"] += 1 - paper_id = (row.get("paper_id", "") or "").strip().lower() - url = row.get("url", "") - - if not paper_id or not url: - stats["skipped"] += 1 - if stats["skipped"] <= 5: - logger.debug( - "Skipping row: missing paper_id or url: %s", - row.get("paper_id", "") or row.get("url", "")[:50], - ) - continue - - mailing_date, mailing_title = _resolve_mailing_date( - row.get("mailing_date", "") - ) try: - document_date = _parse_document_date(row.get("date", "")) - if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): - year = int(mailing_date[:4]) - elif document_date is not None: - year = document_date.year - else: - year = None - title = row.get("title", "") or paper_id - subgroup = row.get("subgroup", "") - author_names = _author_names_from_csv(row.get("author", "")) + parsed = _parse_csv_import_row(row) except Exception as e: stats["skipped"] += 1 + paper_id = (row.get("paper_id", "") or "").strip().lower() logger.error( "Error parsing document date for paper_id=%s: %s", paper_id, @@ -168,78 +275,20 @@ def handle(self, *args, **options): ) continue + if parsed is None: + stats["skipped"] += 1 + if stats["skipped"] <= 5: + logger.debug( + "Skipping row: missing paper_id or url: %s", + row.get("paper_id", "") or row.get("url", "")[:50], + ) + continue + if dry_run: 
- logger.info( - "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", - paper_id, - mailing_date, - document_date, - len(author_names), - ) + _log_dry_run_row(parsed) continue - try: - mailing, mailing_created = get_or_create_mailing( - mailing_date, mailing_title - ) - if mailing_created: - stats["mailings_created"] += 1 - - paper, paper_created = get_or_create_paper( - paper_id=paper_id, - url=url, - title=title, - document_date=document_date, - mailing=mailing, - subgroup=subgroup, - author_names=author_names if author_names else None, - year=year, - ) - if paper_created: - stats["papers_created"] += 1 - else: - stats["papers_updated"] += 1 - except IntegrityError as e: - # Re-resolve mailing (IntegrityError may have come from get_or_create_mailing race) - mailing, _ = get_or_create_mailing(mailing_date, mailing_title) - # Duplicate (paper_id, year): fetch existing by same key and update - try: - lookup_year = year if year is not None else 0 - paper = WG21Paper.objects.filter( - paper_id=paper_id, year=lookup_year - ).first() - if paper is None: - stats["skipped"] += 1 - logger.error("Error for paper_id=%s: %s", paper_id, e) - else: - paper.url = url - paper.title = title - paper.document_date = document_date - paper.mailing = mailing - paper.subgroup = subgroup - if year is not None: - paper.year = year - paper.save() - stats["papers_updated"] += 1 - if author_names: - from cppa_user_tracker.services import ( - get_or_create_wg21_paper_author_profile, - ) - - for i, name in enumerate(author_names): - profile, _ = get_or_create_wg21_paper_author_profile( - name - ) - get_or_create_paper_author(paper, profile, i + 1) - except Exception: - stats["skipped"] += 1 - logger.exception( - "Error for paper_id=%s (after IntegrityError).", - paper_id, - ) - except Exception as e: - stats["skipped"] += 1 - logger.error("Error for paper_id=%s: %s", paper_id, e) + _upsert_paper_from_csv_row(parsed, stats) logger.info( "Rows processed: %d, skipped: %d, 
mailings created: %d, papers created: %d, papers updated: %d", diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index b1885af..3f0965d 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -1,97 +1,158 @@ """ Management command for WG21 Paper Tracker. -Runs the pipeline to fetch new mailings, download papers, upload to GCS, and update DB. -If new papers were found and uploaded, it triggers the Google Cloud Run conversion job. +Runs the pipeline to fetch new mailings, upsert paper metadata in the DB, and optionally +trigger a GitHub repository_dispatch so another repo can download and convert documents. """ import logging -from django.core.management.base import BaseCommand + +import requests from django.conf import settings +from django.core.management.base import BaseCommand, CommandError from wg21_paper_tracker.pipeline import run_tracker_pipeline logger = logging.getLogger(__name__) - -def trigger_cloud_run_job(project_id: str, location: str, job_name: str): - """ - Start the named Cloud Run job (run once, no polling). - - Uses the Cloud Run v2 API to trigger the job identified by project_id, - location, and job_name. The job runs asynchronously; this function returns - the operation and does not wait for the job to finish. - """ - from google.cloud import run_v2 - - client = run_v2.JobsClient() - name = client.job_path(project_id, location, job_name) - request = run_v2.RunJobRequest(name=name) - logger.info("Triggering Cloud Run job %s...", name) - operation = client.run_job(request=request) - logger.info("Cloud Run job triggered. 
Operation: %s", operation.operation.name) - return operation +GITHUB_DISPATCH_URL = "https://api.github.com/repos/{repo}/dispatches" + + +def trigger_github_repository_dispatch( + repo: str, + event_type: str, + token: str, + paper_urls: list[str], +) -> None: + """POST repository_dispatch with client_payload {"papers": [, ...]}.""" + url = GITHUB_DISPATCH_URL.format(repo=repo.strip()) + headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token.strip()}", + "X-GitHub-Api-Version": "2022-11-28", + } + body = { + "event_type": event_type, + "client_payload": {"papers": paper_urls}, + } + logger.info( + "Sending repository_dispatch to %s (event_type=%s, %d URLs).", + repo, + event_type, + len(paper_urls), + ) + response = requests.post(url, json=body, headers=headers, timeout=30) + if not response.ok: + logger.error( + "GitHub repository_dispatch failed: %s %s", + response.status_code, + response.text, + ) + response.raise_for_status() class Command(BaseCommand): - """Run WG21 paper tracker and optionally trigger the Cloud Run conversion job.""" + """Run WG21 paper tracker and optionally trigger GitHub repository_dispatch.""" - help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers." + help = ( + "Run WG21 paper tracker (scrape, DB update) and send new paper URLs via " + "repository_dispatch when enabled." + ) def add_arguments(self, parser): - """Register --dry-run so the command can skip pipeline and Cloud Run.""" parser.add_argument( "--dry-run", action="store_true", - help="Only log what would be done; do not run the pipeline or trigger Cloud Run.", + help="Only log what would be done; do not run the pipeline or dispatch.", + ) + parser.add_argument( + "--from-date", + dest="from_date", + metavar="YYYY-MM", + default=None, + help=( + "Process mailings with mailing_date >= YYYY-MM (WG21 / CSV style). " + "Backfills from that mailing onward; without --to-date, no upper cap." 
+ ), + ) + parser.add_argument( + "--to-date", + dest="to_date", + metavar="YYYY-MM", + default=None, + help=( + "Upper bound: mailing_date <= YYYY-MM. With --from-date, inclusive range; " + "without --from-date, still only mailings newer than DB latest (capped at to)." + ), ) def handle(self, *args, **options): - """ - Run the tracker pipeline; if new papers were uploaded, trigger the Cloud Run job. - - With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. - Otherwise runs the pipeline, then triggers the configured Cloud Run job when - total_new_papers > 0, WG21_CLOUD_RUN_ENABLED is True, and - GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and WG21_GCS_BUCKET are set. - """ dry_run = options.get("dry_run", False) + from_date = options.get("from_date") + to_date = options.get("to_date") + if from_date is not None: + from_date = from_date.strip() + if not from_date: + from_date = None + if to_date is not None: + to_date = to_date.strip() + if not to_date: + to_date = None if dry_run: - logger.info("Dry run: skipping pipeline and Cloud Run trigger.") + if from_date or to_date: + logger.info( + "Dry run: skipping pipeline and GitHub dispatch " + "(from=%r, to=%r).", + from_date, + to_date, + ) + else: + logger.info("Dry run: skipping pipeline and GitHub dispatch.") return logger.info("Starting WG21 Paper Tracker...") try: - total_new_papers = run_tracker_pipeline() - logger.info("Processed %d new papers.", total_new_papers) - - if total_new_papers > 0: - project_id = getattr(settings, "GCP_PROJECT_ID", None) - location = getattr(settings, "GCP_LOCATION", "us-central1") - job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) - bucket = getattr(settings, "WG21_GCS_BUCKET", None) - cloud_run_enabled = getattr(settings, "WG21_CLOUD_RUN_ENABLED", False) - - if project_id and job_name and bucket and cloud_run_enabled: - try: - trigger_cloud_run_job(project_id, location, job_name) - logger.info( - "Successfully triggered Cloud Run job %s.", 
job_name - ) - except Exception: - logger.exception( - "Failed to trigger Cloud Run job %s.", job_name - ) - raise - else: - logger.warning( - "Skipping Cloud Run trigger: set WG21_CLOUD_RUN_ENABLED=True " - "and configure GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and " - "WG21_GCS_BUCKET to enable." - ) - else: - logger.info("No new papers found. Skipping Cloud Run job.") - + result = run_tracker_pipeline( + from_mailing_date=from_date, + to_mailing_date=to_date, + ) + n = result.new_paper_count + logger.info("Recorded %d new paper(s); %d URL(s) for dispatch.", n, n) + + if not n: + logger.info("No new papers in this run. Skipping GitHub dispatch.") + return + + repo = getattr(settings, "WG21_GITHUB_DISPATCH_REPO", "") or "" + token = getattr(settings, "WG21_GITHUB_DISPATCH_TOKEN", "") or "" + enabled = getattr(settings, "WG21_GITHUB_DISPATCH_ENABLED", False) + event_type = getattr( + settings, + "WG21_GITHUB_DISPATCH_EVENT_TYPE", + "wg21_papers_convert", + ) + + if not enabled or not repo or not token: + logger.warning( + "Skipping GitHub dispatch: set WG21_GITHUB_DISPATCH_ENABLED=True " + "and configure WG21_GITHUB_DISPATCH_REPO and " + "WG21_GITHUB_DISPATCH_TOKEN." + ) + return + try: + trigger_github_repository_dispatch( + repo, + event_type, + token, + list(result.new_paper_urls), + ) + logger.info("repository_dispatch sent successfully.") + except Exception: + logger.exception("Failed to send repository_dispatch.") + raise + + except ValueError as e: + raise CommandError(str(e)) from e except Exception as e: logger.exception("WG21 Paper Tracker failed: %s", e) raise diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 516837e..ff15f50 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -1,114 +1,293 @@ """ Pipeline for WG21 Paper Tracker. -Coordinates scraping, downloading, uploading to GCS, and updating the database. 
+Coordinates scraping and updating the database (metadata only; no file download or GCS). """ -import time -import requests +from __future__ import annotations + import logging -from pathlib import Path +import re +from dataclasses import dataclass, field +from datetime import date, datetime +from typing import Any, Optional -from django.conf import settings -from google.cloud import storage +from django.utils.dateparse import parse_date from wg21_paper_tracker.fetcher import ( fetch_all_mailings, fetch_papers_for_mailing, ) -from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.models import WG21Mailing from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, ) -from wg21_paper_tracker.workspace import get_raw_dir logger = logging.getLogger(__name__) -DOWNLOAD_TIMEOUT = 30 -DOWNLOAD_MAX_RETRIES = 3 -DOWNLOAD_RETRY_DELAY = 2 +# WG21 mailing_date and typical CSV column (e.g. 2025-03, 2026-01) +_MAILING_DATE_LABEL_RE = re.compile(r"^\d{4}-\d{2}$") + + +def _normalize_mailing_date_label(label: str, *, field_name: str) -> str: + s = label.strip() + if not _MAILING_DATE_LABEL_RE.match(s): + raise ValueError( + f"Invalid {field_name} {label!r}; " + "expected YYYY-MM (e.g. 2025-03), same as WG21 / CSV mailing keys." 
+ ) + return s -def _upload_to_gcs( - bucket_name: str, source_path: Path, destination_blob_name: str +def _mailing_date_in_run_scope( + mailing_date: str, + *, + latest_date: str, + from_mailing_date: Optional[str], + to_mailing_date: Optional[str], ) -> bool: - """Uploads a file to the bucket.""" + """Whether a mailing key is selected for this run (before retry merge).""" + if from_mailing_date is None and to_mailing_date is None: + return mailing_date > latest_date + + if from_mailing_date is not None and mailing_date < from_mailing_date: + return False + if to_mailing_date is not None and mailing_date > to_mailing_date: + return False + if from_mailing_date is None and to_mailing_date is not None: + return mailing_date > latest_date + return True + + +def _format_priority(ext: str) -> int: + """Prefer adoc > html > ps > pdf when multiple formats exist for one paper_id.""" + priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} + return priorities.get(ext.lower(), 100) + + +def _parse_mailing_year(m_info: dict) -> int: + """Return 4-digit year from the index mailing dict, or 0 if missing/invalid.""" + mailing_date = m_info["mailing_date"] + year_raw = m_info.get("year") + if not year_raw or not str(year_raw).strip(): + logger.warning( + "Mailing %s: year missing or empty, using 0 (fix later).", + mailing_date, + ) + return 0 try: - storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name) - blob = bucket.blob(destination_blob_name) + year = int(str(year_raw).strip()[:4]) + except (ValueError, TypeError): + logger.warning( + "Mailing %s: year not parseable %r, using 0 (fix later).", + mailing_date, + year_raw, + ) + return 0 + if year <= 0 or year > datetime.now().year + 1: + logger.warning( + "Mailing %s: year invalid, using 0 (fix later).", + mailing_date, + ) + return 0 + return year - blob.upload_from_filename(str(source_path)) - logger.info( - "Uploaded %s to gs://%s/%s", - source_path.name, - bucket_name, - destination_blob_name, + 
+def _group_fetched_papers_by_id( + papers: list[dict[str, Any]], mailing_date: str +) -> dict[str, list[dict[str, Any]]]: + """Bucket fetcher rows by normalized paper_id.""" + papers_by_id: dict[str, list[dict[str, Any]]] = {} + for p in papers: + pid = (p.get("paper_id") or "").strip().lower() + if not pid: + logger.warning( + "Skipping paper entry without a paper_id in mailing %s: %r", + mailing_date, + p, + ) + continue + papers_by_id.setdefault(pid, []).append(p) + return papers_by_id + + +def _valid_paper_entries_for_id( + p_list: list[dict[str, Any]], pid: str, mailing_date: str +) -> list[dict[str, Any]]: + """Keep rows that have type, url, and title (all non-empty).""" + valid: list[dict[str, Any]] = [] + for p in p_list: + type_val = ( + (p.get("type") or "").strip() if isinstance(p.get("type"), str) else "" + ) + url_val = (p.get("url") or "").strip() if isinstance(p.get("url"), str) else "" + title_val = ( + (p.get("title") or "").strip() if isinstance(p.get("title"), str) else "" ) - return True + if not type_val or not url_val or not title_val: + logger.debug( + "Skipping malformed paper entry for %s in mailing %s: %r", + pid, + mailing_date, + p, + ) + continue + valid.append(p) + return valid + + +def _choose_best_format_entry(valid_list: list[dict[str, Any]]) -> dict[str, Any]: + """Pick one row by format priority (adoc first). 
Precondition: valid_list non-empty.""" + return min( + valid_list, + key=lambda x: _format_priority(str(x.get("type") or "").strip()), + ) + + +def _parse_scraped_document_date(doc_date_str: Any) -> Optional[date]: + if not doc_date_str: + return None + try: + return parse_date(str(doc_date_str).strip()) except Exception as e: - logger.error("Failed to upload to GCS: %s", e) - return False + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + return None + + +def _upsert_paper_from_scraped_row( + pid: str, + best_paper: dict[str, Any], + mailing_obj: WG21Mailing, + year: int, + mailing_date: str, +) -> Optional[str]: + """ + Create or update WG21Paper from the chosen fetcher row. + Returns the document URL if a **new** row was inserted, else None. + """ + url = (best_paper.get("url") or "").strip() + paper_title = (best_paper.get("title") or "").strip() + subgroup = (best_paper.get("subgroup") or "").strip() + authors = best_paper.get("authors") + if not isinstance(authors, list): + authors = [] + if not url or not paper_title: + logger.warning( + "Skipping paper %s in mailing %s due to missing required fields: %r", + pid, + mailing_date, + best_paper, + ) + return None + + doc_date = _parse_scraped_document_date(best_paper.get("document_date")) + _paper_obj, created = get_or_create_paper( + paper_id=pid, + url=url, + title=paper_title, + document_date=doc_date, + mailing=mailing_obj, + subgroup=subgroup, + author_names=authors, + year=year, + ) + return url if created else None -def _download_file(url: str, filepath: Path) -> bool: - """Download file from URL to filepath with retries and 30s timeout.""" - for attempt in range(1, DOWNLOAD_MAX_RETRIES + 1): - try: - logger.info( - "Downloading %s to %s (attempt %d/%d)", - url, - filepath, - attempt, - DOWNLOAD_MAX_RETRIES, +def _process_single_mailing(m_info: dict) -> list[str]: + """ + For one mailing from the index: normalize year, get/create WG21Mailing, + fetch paper rows from 
the site, upsert WG21Paper rows. + + Returns URLs for papers **newly created** in this run for this mailing. + """ + mailing_date = m_info["mailing_date"] + title = m_info["title"] + year = _parse_mailing_year(m_info) + mailing_obj, _ = get_or_create_mailing(mailing_date, title) + + papers = fetch_papers_for_mailing(str(year), mailing_date) + if not papers: + logger.info( + "Mailing %s: no papers found (anchor/table may be missing).", + mailing_date, + ) + return [] + + papers_by_id = _group_fetched_papers_by_id(papers, mailing_date) + new_urls: list[str] = [] + + for pid, p_list in papers_by_id.items(): + valid_list = _valid_paper_entries_for_id(p_list, pid, mailing_date) + if not valid_list: + logger.warning( + "Skipping paper %s in mailing %s: no valid entries (type, url, title)", + pid, + mailing_date, ) - response = requests.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) - response.raise_for_status() - - # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. - content_type = response.headers.get("content-type", "") - if "text" in content_type: - with open(filepath, "w", encoding="utf-8") as f: - f.write( - response.content.decode( - response.apparent_encoding or "utf-8", - errors="replace", - ) - ) - else: - with open(filepath, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - return True - except Exception as e: - if attempt < DOWNLOAD_MAX_RETRIES: - logger.warning( - "Download attempt %d/%d failed for %s: %s. 
Retrying in %ds.", - attempt, - DOWNLOAD_MAX_RETRIES, - url, - e, - DOWNLOAD_RETRY_DELAY, - ) - time.sleep(DOWNLOAD_RETRY_DELAY) - else: - logger.error( - "Failed to download %s after %d attempts: %s", - url, - DOWNLOAD_MAX_RETRIES, - e, - ) - return False - - -def run_tracker_pipeline() -> int: + continue + best_paper = _choose_best_format_entry(valid_list) + url = _upsert_paper_from_scraped_row( + pid, best_paper, mailing_obj, year, mailing_date + ) + if url: + new_urls.append(url) + + return new_urls + + +@dataclass(frozen=True) +class TrackerPipelineResult: + """Result of run_tracker_pipeline: URLs for papers newly created in this run.""" + + new_paper_urls: tuple[str, ...] = field(default_factory=tuple) + + @property + def new_paper_count(self) -> int: + return len(self.new_paper_urls) + + +def run_tracker_pipeline( + *, + from_mailing_date: Optional[str] = None, + to_mailing_date: Optional[str] = None, +) -> TrackerPipelineResult: """ - Run the WG21 tracker pipeline. - Returns the number of new papers downloaded and uploaded. + Run the WG21 tracker pipeline: scrape mailings, upsert papers in the DB. + Returns URLs for rows created in this run (for GitHub repository_dispatch). + + Mailing keys are ``YYYY-MM`` (WG21 / CSV style). Selection: + + - Neither ``from_mailing_date`` nor ``to_mailing_date``: process mailings with + ``mailing_date`` strictly newer than the latest ``WG21Mailing`` in the DB. + - ``from_mailing_date`` only: ``mailing_date >= from_mailing_date``. + - ``to_mailing_date`` only: ``mailing_date > latest_in_db`` and + ``mailing_date <= to_mailing_date`` (incremental runs capped at ``to``). + - Both: ``from_mailing_date <= mailing_date <= to_mailing_date`` (inclusive). + + ``from_mailing_date`` must not be lexicographically after ``to_mailing_date``. """ - bucket_name = settings.WG21_GCS_BUCKET - if not bucket_name: - logger.warning("WG21_GCS_BUCKET not set. 
Will download but not upload to GCS.") + if from_mailing_date is not None: + from_mailing_date = _normalize_mailing_date_label( + from_mailing_date, field_name="from_mailing_date" + ) + if to_mailing_date is not None: + to_mailing_date = _normalize_mailing_date_label( + to_mailing_date, field_name="to_mailing_date" + ) + if ( + from_mailing_date is not None + and to_mailing_date is not None + and from_mailing_date > to_mailing_date + ): + raise ValueError( + f"from_mailing_date {from_mailing_date!r} is after " + f"to_mailing_date {to_mailing_date!r}." + ) # 1. Get latest mailing from DB latest_mailing = ( @@ -122,23 +301,49 @@ def run_tracker_pipeline() -> int: all_mailings = fetch_all_mailings() if not all_mailings: logger.warning("No mailings found on WG21 site.") - return 0 + return TrackerPipelineResult() + + # Filter mailings to process + new_mailings = [ + m + for m in all_mailings + if _mailing_date_in_run_scope( + m["mailing_date"], + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + ] + if from_mailing_date is None and to_mailing_date is None: + baseline_desc = f"latest_in_db={latest_date}" + else: + parts: list[str] = [] + if from_mailing_date is not None: + parts.append(f"from={from_mailing_date}") + if to_mailing_date is not None: + parts.append(f"to={to_mailing_date}") + if from_mailing_date is None: + parts.append(f"latest_in_db={latest_date}") + baseline_desc = ", ".join(parts) - # Filter newer mailings - new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] # Requeue incomplete mailings so transient failures get retried (not just the latest) retry_dates = set( WG21Mailing.objects.filter(papers__isnull=True).values_list( "mailing_date", flat=True ) ) - retry_dates.update( - WG21Mailing.objects.filter(papers__is_downloaded=False).values_list( - "mailing_date", flat=True - ) - ) if latest_mailing: retry_dates.add(latest_mailing.mailing_date) + retry_dates = { + d + for d in 
retry_dates + if _mailing_date_in_run_scope( + d, + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + } for current_m in all_mailings: if current_m["mailing_date"] in retry_dates and current_m[ "mailing_date" @@ -149,211 +354,13 @@ def run_tracker_pipeline() -> int: new_mailings.sort(key=lambda x: x["mailing_date"]) logger.info( - "Pipeline: latest_date=%s, all_mailings=%d, mailings_to_process=%s", - latest_date, + "Pipeline: %s, all_mailings=%d, mailings_to_process=%s", + baseline_desc, len(all_mailings), [m["mailing_date"] for m in new_mailings], ) - total_new_papers = 0 - + new_urls: list[str] = [] for m_info in new_mailings: - mailing_date = m_info["mailing_date"] - title = m_info["title"] - # Normalize year once; use 0 when missing/empty/unparseable so you can fix later - year_raw = m_info.get("year") - if not year_raw or not str(year_raw).strip(): - year = 0 - logger.warning( - "Mailing %s: year missing or empty, using 0 (fix later).", - mailing_date, - ) - else: - try: - year = int(str(year_raw).strip()[:4]) - if year <= 0: - year = 0 - logger.warning( - "Mailing %s: year invalid, using 0 (fix later).", - mailing_date, - ) - except (ValueError, TypeError): - year = 0 - logger.warning( - "Mailing %s: year not parseable %r, using 0 (fix later).", - mailing_date, - year_raw, - ) - - # Create/get mailing in DB - mailing_obj, _ = get_or_create_mailing(mailing_date, title) - - # Fetch papers for this mailing - papers = fetch_papers_for_mailing(str(year), mailing_date) - if not papers: - logger.info( - "Mailing %s: no papers found (anchor/table may be missing).", - mailing_date, - ) - continue - - # Group papers by ID so we can choose the preferred source format per paper. 
- papers_by_id = {} - for p in papers: - pid = (p.get("paper_id") or "").strip().lower() - if not pid: - logger.warning( - "Skipping paper entry without a paper_id in mailing %s: %r", - mailing_date, - p, - ) - continue - if pid not in papers_by_id: - papers_by_id[pid] = [] - papers_by_id[pid].append(p) - - def format_priority(ext: str) -> int: - priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} - return priorities.get(ext.lower(), 100) - - raw_dir = get_raw_dir(mailing_date, year) - - skipped_downloaded = 0 - for pid, p_list in papers_by_id.items(): - # Skip only if this (paper_id, year) is already downloaded - if WG21Paper.objects.filter( - paper_id=pid, - year=year, - is_downloaded=True, - ).exists(): - skipped_downloaded += 1 - continue - - # Filter to entries with required keys and valid types; skip malformed. - valid_list = [] - for p in p_list: - type_val = ( - (p.get("type") or "").strip() - if isinstance(p.get("type"), str) - else "" - ) - url_val = ( - (p.get("url") or "").strip() - if isinstance(p.get("url"), str) - else "" - ) - title_val = ( - (p.get("title") or "").strip() - if isinstance(p.get("title"), str) - else "" - ) - if not type_val or not url_val or not title_val: - logger.debug( - "Skipping malformed paper entry for %s in mailing %s: %r", - pid, - mailing_date, - p, - ) - continue - valid_list.append(p) - - if not valid_list: - logger.warning( - "Skipping paper %s in mailing %s: no valid entries (type, url, title)", - pid, - mailing_date, - ) - continue - - # Pick the preferred format: adoc > html > ps > pdf. 
- best_paper = min( - valid_list, - key=lambda x: format_priority(str(x.get("type") or "").strip()), - ) - url = (best_paper.get("url") or "").strip() - title = (best_paper.get("title") or "").strip() - subgroup = (best_paper.get("subgroup") or "").strip() - authors = best_paper.get("authors") - if not isinstance(authors, list): - authors = [] - if not url or not title: - logger.warning( - "Skipping paper %s in mailing %s due to missing required fields: %r", - pid, - mailing_date, - best_paper, - ) - continue - - raw_filename = (best_paper.get("filename") or "").strip() - filename = Path(raw_filename).name - if not filename or filename != raw_filename: - logger.warning( - "Skipping paper %s due to unsafe filename %r", - pid, - raw_filename, - ) - continue - local_path = raw_dir / filename - - # Persist paper row before transfer so failed downloads remain retry candidates - doc_date_str = best_paper.get("document_date") - from django.utils.dateparse import parse_date - - doc_date = None - if doc_date_str: - try: - doc_date = parse_date(doc_date_str) - except Exception as e: - logger.warning( - "Failed to parse document date: %s: %s", - doc_date_str, - e, - ) - doc_date = None - - paper_obj, _created = get_or_create_paper( - paper_id=pid, - url=url, - title=title, - document_date=doc_date, - mailing=mailing_obj, - subgroup=subgroup, - author_names=authors, - year=year, - ) - - # Download - if _download_file(url, local_path): - uploaded = False - if bucket_name: - gcs_path = ( - f"raw/wg21_paper_tracker/{year}/{mailing_date}/{filename}" - ) - uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path) - else: - logger.warning( - "WG21_GCS_BUCKET is not configured; leaving %s as not downloaded.", - pid, - ) - - if uploaded: - paper_obj.is_downloaded = True - paper_obj.save(update_fields=["is_downloaded", "updated_at"]) - total_new_papers += 1 - - # Clean up local file to save space - # try: - # # local_path.unlink() - # except Exception as e: - # logger.warning( - # 
"Could not delete temp file %s: %s", local_path, e - # ) - - if skipped_downloaded: - logger.info( - "Mailing %s: skipped %d papers (already downloaded).", - mailing_date, - skipped_downloaded, - ) + new_urls.extend(_process_single_mailing(m_info)) - return total_new_papers + return TrackerPipelineResult(new_paper_urls=tuple(new_urls)) diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py index 34a52e9..a099ccb 100644 --- a/wg21_paper_tracker/tests/test_commands.py +++ b/wg21_paper_tracker/tests/test_commands.py @@ -1,12 +1,18 @@ """Tests for wg21_paper_tracker management commands.""" +from unittest.mock import MagicMock, patch + import pytest from django.core.management import call_command from django.core.management.base import CommandError +from django.test.utils import override_settings + +from wg21_paper_tracker.pipeline import TrackerPipelineResult CMD_NAME = "import_wg21_metadata_from_csv" +RUN_TRACKER_CMD = "run_wg21_paper_tracker" def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): @@ -16,3 +22,117 @@ def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): with pytest.raises(CommandError, match=r"File not found:"): call_command(CMD_NAME, f"--csv-file={csv_path}") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_posts_dispatch_when_enabled(): + """run_wg21_paper_tracker sends repository_dispatch with papers URL list.""" + mock_resp = MagicMock() + mock_resp.ok = True + mock_resp.status_code = 204 + mock_resp.text = "" + + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult( + new_paper_urls=("https://open-std.org/a.pdf", "https://open-std.org/b.pdf") + ), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + return_value=mock_resp, + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + 
WG21_GITHUB_DISPATCH_REPO="myorg/convert-repo", + WG21_GITHUB_DISPATCH_TOKEN="secret-token", + WG21_GITHUB_DISPATCH_EVENT_TYPE="wg21_papers_convert", + ): + call_command(RUN_TRACKER_CMD) + + m_post.assert_called_once() + assert m_post.call_args[0][0] == ( + "https://api.github.com/repos/myorg/convert-repo/dispatches" + ) + body = m_post.call_args[1]["json"] + assert body["event_type"] == "wg21_papers_convert" + assert body["client_payload"] == { + "papers": [ + "https://open-std.org/a.pdf", + "https://open-std.org/b.pdf", + ], + } + headers = m_post.call_args[1]["headers"] + assert headers["Authorization"] == "Bearer secret-token" + assert headers["Accept"] == "application/vnd.github+json" + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_skips_post_when_no_new_papers(): + """No HTTP request when pipeline returns no new URLs.""" + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + WG21_GITHUB_DISPATCH_REPO="o/r", + WG21_GITHUB_DISPATCH_TOKEN="t", + ): + call_command(RUN_TRACKER_CMD) + m_post.assert_not_called() + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_skips_post_when_dispatch_disabled(): + """No HTTP request when WG21_GITHUB_DISPATCH_ENABLED is False.""" + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(new_paper_urls=("https://x/y",)), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=False, + WG21_GITHUB_DISPATCH_REPO="o/r", + WG21_GITHUB_DISPATCH_TOKEN="t", + ): + call_command(RUN_TRACKER_CMD) + m_post.assert_not_called() + + +@pytest.mark.django_db +def 
test_run_wg21_paper_tracker_rejects_invalid_from_date(): + """--from-date must be YYYY-MM.""" + with pytest.raises(CommandError, match="Invalid from_mailing_date"): + call_command(RUN_TRACKER_CMD, "--from-date=bad") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_passes_from_date_to_pipeline(): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ) as m: + call_command(RUN_TRACKER_CMD, "--from-date=2025-03") + m.assert_called_once_with(from_mailing_date="2025-03", to_mailing_date=None) + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_rejects_invalid_to_date(): + with pytest.raises(CommandError, match="Invalid to_mailing_date"): + call_command(RUN_TRACKER_CMD, "--to-date=bad") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_passes_from_and_to_date_to_pipeline(): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ) as m: + call_command(RUN_TRACKER_CMD, "--from-date=2025-01", "--to-date=2025-03") + m.assert_called_once_with(from_mailing_date="2025-01", to_mailing_date="2025-03") diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index 3b903fb..93b2158 100644 --- a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -3,9 +3,11 @@ from unittest.mock import patch, MagicMock import requests +from bs4 import BeautifulSoup from wg21_paper_tracker.fetcher import ( BASE_URL, + extract_paper_metadata_from_table_row, fetch_all_mailings, fetch_papers_for_mailing, ) @@ -204,3 +206,69 @@ def test_fetch_papers_for_mailing_calls_year_url(): ) fetch_papers_for_mailing("2025", "2025-01") m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30) + + +# --- extract_paper_metadata_from_table_row --- + + +def test_extract_paper_metadata_from_table_row_returns_none_when_no_cells(): + 
"""Empty cell list yields no paper.""" + assert extract_paper_metadata_from_table_row([], f"{BASE_URL}/2025/") is None + + +def test_extract_paper_metadata_from_table_row_returns_none_when_no_paper_link(): + """Row without a matching paper href returns None.""" + html = "No link heret" + row = BeautifulSoup(html, "html.parser").find("tr") + cells = row.find_all(["td", "th"]) + assert extract_paper_metadata_from_table_row(cells, f"{BASE_URL}/2025/") is None + + +def test_extract_paper_metadata_from_table_row_parses_legacy_five_column_row(): + """Older tables: Number, Title, Author, Document date, Subgroup (subgroup at index 4).""" + html = """ + + P1234R0 + My title + Author One, Author Two + 2025-03-15 + LEWG + + """ + row = BeautifulSoup(html, "html.parser").find("tr") + cells = row.find_all(["td", "th"]) + page_url = f"{BASE_URL}/2025/" + result = extract_paper_metadata_from_table_row(cells, page_url) + assert result is not None + assert result["paper_id"] == "p1234r0" + assert result["type"] == "pdf" + assert result["filename"] == "p1234r0.pdf" + assert result["url"] == f"{BASE_URL}/2025/p1234r0.pdf" + assert result["title"] == "My title" + assert result["authors"] == ["Author One", "Author Two"] + assert result["document_date"] == "2025-03-15" + assert result["subgroup"] == "LEWG" + + +def test_extract_paper_metadata_from_table_row_parses_eight_column_row(): + """2026+ style: subgroup is column 7 (index 6), not index 4 (mailing date).""" + html = """ + + P1000R7 + C++ IS Schedule (proposed) + Herb Sutter + 2026-01-13 + 2026-01 + P1000R6 + All of WG21 + + + """ + row = BeautifulSoup(html, "html.parser").find("tr") + cells = row.find_all(["td", "th"]) + page_url = f"{BASE_URL}/2026/" + result = extract_paper_metadata_from_table_row(cells, page_url) + assert result is not None + assert result["paper_id"] == "p1000r7" + assert result["document_date"] == "2026-01-13" + assert result["subgroup"] == "All of WG21" diff --git a/wg21_paper_tracker/tests/test_pipeline.py 
b/wg21_paper_tracker/tests/test_pipeline.py index ad56a29..099f105 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -1,111 +1,27 @@ """Tests for wg21_paper_tracker.pipeline.""" -from unittest.mock import patch, MagicMock +from unittest.mock import patch import pytest -import requests - -from wg21_paper_tracker.pipeline import ( - DOWNLOAD_TIMEOUT, - DOWNLOAD_MAX_RETRIES, - _download_file, - run_tracker_pipeline, -) - - -# --- _download_file --- - - -def test_download_file_success_text(tmp_path): - """_download_file saves text response and returns True.""" - url = "https://example.com/doc.html" - filepath = tmp_path / "doc.html" - resp = MagicMock() - resp.raise_for_status = MagicMock() - resp.headers = {"content-type": "text/html; charset=utf-8"} - resp.content = b"Hello" - resp.apparent_encoding = "utf-8" - resp.iter_content = None - with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp): - result = _download_file(url, filepath) - assert result is True - assert filepath.read_text(encoding="utf-8") == "Hello" - - -def test_download_file_success_binary(tmp_path): - """_download_file saves binary response and returns True.""" - url = "https://example.com/doc.pdf" - filepath = tmp_path / "doc.pdf" - resp = MagicMock() - resp.raise_for_status = MagicMock() - resp.headers = {"content-type": "application/pdf"} - resp.iter_content = lambda chunk_size: (b"\x25\x50\x44\x46",) - with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp): - result = _download_file(url, filepath) - assert result is True - assert filepath.read_bytes() == b"\x25\x50\x44\x46" - - -def test_download_file_uses_timeout(tmp_path): - """_download_file calls requests.get with DOWNLOAD_TIMEOUT.""" - url = "https://example.com/f" - filepath = tmp_path / "out" - resp = MagicMock() - resp.raise_for_status = MagicMock() - resp.headers = {"content-type": "text/plain"} - resp.content = b"x" - resp.apparent_encoding = 
"utf-8" - with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp) as m: - _download_file(url, filepath) - m.assert_called_once() - assert m.call_args[1]["timeout"] == DOWNLOAD_TIMEOUT - - -def test_download_file_retries_on_failure(tmp_path): - """_download_file retries up to DOWNLOAD_MAX_RETRIES then returns False.""" - url = "https://example.com/f" - filepath = tmp_path / "f" - with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = requests.RequestException("connection error") - with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock: - result = _download_file(url, filepath) - assert result is False - assert m.call_count == DOWNLOAD_MAX_RETRIES - assert sleep_mock.call_count == DOWNLOAD_MAX_RETRIES - 1 - - -def test_download_file_succeeds_on_second_attempt(tmp_path): - """_download_file succeeds when a retry succeeds.""" - url = "https://example.com/f" - filepath = tmp_path / "f" - resp = MagicMock() - resp.raise_for_status = MagicMock() - resp.headers = {"content-type": "text/plain"} - resp.content = b"ok" - resp.apparent_encoding = "utf-8" - with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = [requests.RequestException("first fail"), resp] - with patch("wg21_paper_tracker.pipeline.time.sleep"): - result = _download_file(url, filepath) - assert result is True - assert m.call_count == 2 - assert filepath.read_text() == "ok" + +from wg21_paper_tracker.pipeline import TrackerPipelineResult, run_tracker_pipeline # --- run_tracker_pipeline --- @pytest.mark.django_db -def test_run_tracker_pipeline_returns_zero_when_no_mailings(): - """run_tracker_pipeline returns 0 when fetch_all_mailings returns [].""" +def test_run_tracker_pipeline_returns_empty_when_no_mailings(): + """run_tracker_pipeline returns empty result when fetch_all_mailings returns [].""" with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]): - n = run_tracker_pipeline() - assert n == 0 + result = 
run_tracker_pipeline() + assert result.new_paper_count == 0 + assert result.new_paper_urls == () @pytest.mark.django_db def test_run_tracker_pipeline_skips_when_no_new_mailings(): - """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB.""" + """run_tracker_pipeline returns empty when all mailings are <= latest in DB.""" from wg21_paper_tracker.models import WG21Mailing WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") @@ -117,13 +33,13 @@ def test_run_tracker_pipeline_skips_when_no_new_mailings(): with patch( "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[] ): - n = run_tracker_pipeline() - assert n == 0 + result = run_tracker_pipeline() + assert result.new_paper_count == 0 @pytest.mark.django_db -def test_run_tracker_pipeline_downloads_new_papers(tmp_path): - """run_tracker_pipeline downloads papers for new mailings and returns count.""" +def test_run_tracker_pipeline_collects_urls_for_new_papers(): + """run_tracker_pipeline returns URLs for papers created in this run.""" from wg21_paper_tracker.models import WG21Mailing WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") @@ -147,19 +63,159 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path): with patch( "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers ): - with patch( - "wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path - ): - with patch( - "wg21_paper_tracker.pipeline._download_file", return_value=True - ): - with patch( - "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", - "test-bucket", - ): - with patch( - "wg21_paper_tracker.pipeline._upload_to_gcs", - return_value=True, - ): - n = run_tracker_pipeline() - assert n == 1 + result = run_tracker_pipeline() + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1000r0.pdf",) + + +@pytest.mark.django_db +def 
test_run_tracker_pipeline_from_mailing_date_backfills_older_than_db_latest(): + """from_mailing_date includes mailings >= date even when DB latest is newer.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Older", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest in DB", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1111r0", + "url": "https://example.com/p1111r0.pdf", + "filename": "p1111r0.pdf", + "title": "January paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + result = run_tracker_pipeline(from_mailing_date="2025-01") + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1111r0.pdf",) + + +@pytest.mark.django_db +def test_run_tracker_pipeline_second_run_no_new_urls(): + """Existing papers do not add URLs on a subsequent run.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": "https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + first = run_tracker_pipeline() + second = run_tracker_pipeline() + assert first.new_paper_count == 1 + assert second.new_paper_count == 0 + + +def test_tracker_pipeline_result_count(): + 
"""TrackerPipelineResult.new_paper_count matches tuple length.""" + r = TrackerPipelineResult(new_paper_urls=("a", "b")) + assert r.new_paper_count == 2 + + +def test_run_tracker_pipeline_rejects_bad_from_mailing_date(): + """from_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid from_mailing_date"): + run_tracker_pipeline(from_mailing_date="not-valid") + + +def test_run_tracker_pipeline_rejects_bad_to_mailing_date(): + """to_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid to_mailing_date"): + run_tracker_pipeline(to_mailing_date="not-valid") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_rejects_from_after_to(): + with pytest.raises(ValueError, match="after"): + run_tracker_pipeline(from_mailing_date="2025-03", to_mailing_date="2025-01") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_mailing_date_caps_inclusive_range(): + """With from and to, mailings outside [from, to] are skipped.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-03", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Too early", "year": "2025"}, + {"mailing_date": "2025-02", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-03", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-04", "title": "Too late", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p2222r0", + "url": "https://example.com/p2222r0.pdf", + "filename": "p2222r0.pdf", + "title": "Feb", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline( + from_mailing_date="2025-02", to_mailing_date="2025-03" + ) + assert result.new_paper_count == 1 + assert fetch.call_count == 2 
+ + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_only_caps_incremental_above_latest(): + """to_mailing_date without from: still require mailing_date > latest_in_db.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Latest") + mailings = [ + {"mailing_date": "2025-01", "title": "Latest", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + {"mailing_date": "2025-03", "title": "Too new for cap", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p3333r0", + "url": "https://example.com/p3333r0.pdf", + "filename": "p3333r0.pdf", + "title": "A", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline(to_mailing_date="2025-02") + assert result.new_paper_count == 1 + assert fetch.call_count == 1 From 818dcafb0367d61c18a89a6b3994f67a97f5d464 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 21 Mar 2026 05:07:26 -0700 Subject: [PATCH 26/76] Remove migration #24 --- .../0005_alter_slackuser_slack_user_id.py | 18 ------------------ ...005_wg21paperauthorprofile_author_alias.py} | 2 +- dev-24error: | 0 requirements.txt | 2 +- wg21_paper_tracker/migrations/0001_initial.py | 2 +- 5 files changed, 3 insertions(+), 21 deletions(-) delete mode 100644 cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py rename cppa_user_tracker/migrations/{0006_wg21paperauthorprofile_author_alias.py => 0005_wg21paperauthorprofile_author_alias.py} (84%) delete mode 100644 dev-24error: diff --git a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py deleted file mode 100644 index f1cde2c..0000000 --- 
a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 4.2.28 on 2026-03-09 15:35 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('cppa_user_tracker', '0004_alter_slackuser_slack_user_id_and_more'), - ] - - operations = [ - migrations.AlterField( - model_name='slackuser', - name='slack_user_id', - field=models.CharField(max_length=64, unique=True), - ), - ] diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py similarity index 84% rename from cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py rename to cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py index 1660763..5623629 100644 --- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py +++ b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), + ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"), ] operations = [ diff --git a/dev-24error: b/dev-24error: deleted file mode 100644 index e69de29..0000000 diff --git a/requirements.txt b/requirements.txt index 572af95..5e3ce95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,4 @@ PyGithub>=2.0 # cppa_pinecone_sync app pinecone>=3.0 langchain-core>=0.1 -langchain-text-splitters>=0.0.1 \ No newline at end of file +langchain-text-splitters>=0.0.1 diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index a2bbf3d..9c6b4d6 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -9,7 +9,7 @@ class Migration(migrations.Migration): initial = True dependencies = [ - 
("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), ] operations = [ From 6748f28580bca3281551e989a04500e262f623b6 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 25 Mar 2026 02:04:30 -0400 Subject: [PATCH 27/76] unified issues/PR sync, backward commit pagination, Link-based REST - #125 --- clang_github_tracker/sync_raw.py | 57 ++- cppa_pinecone_sync/sync.py | 10 +- cppa_pinecone_sync/tests/test_sync.py | 7 +- github_activity_tracker/fetcher.py | 466 ++++++++++++------ github_activity_tracker/services.py | 4 +- github_activity_tracker/sync/__init__.py | 14 +- github_activity_tracker/sync/issues.py | 207 -------- .../sync/issues_and_prs.py | 370 ++++++++++++++ github_activity_tracker/sync/pull_requests.py | 235 --------- .../tests/test_client_link_parsing.py | 57 +++ github_activity_tracker/tests/test_fetcher.py | 157 +++--- .../tests/test_fetcher_commits_backward.py | 129 +++++ .../tests/test_fetcher_date_helpers.py | 106 ++++ .../tests/test_fetcher_issues_and_prs.py | 175 +++++++ github_activity_tracker/tests/test_sync.py | 44 +- .../tests/test_sync_issues_and_prs.py | 193 ++++++++ github_ops/client.py | 51 ++ 17 files changed, 1526 insertions(+), 756 deletions(-) delete mode 100644 github_activity_tracker/sync/issues.py create mode 100644 github_activity_tracker/sync/issues_and_prs.py delete mode 100644 github_activity_tracker/sync/pull_requests.py create mode 100644 github_activity_tracker/tests/test_client_link_parsing.py create mode 100644 github_activity_tracker/tests/test_fetcher_commits_backward.py create mode 100644 github_activity_tracker/tests/test_fetcher_date_helpers.py create mode 100644 github_activity_tracker/tests/test_fetcher_issues_and_prs.py create mode 100644 github_activity_tracker/tests/test_sync_issues_and_prs.py diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 8a38280..188836d 100644 --- 
a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -46,7 +46,8 @@ def _commit_date(commit_data: dict) -> datetime | None: def _issue_date(issue_data: dict) -> datetime | None: """Extract updated_at or created_at from GitHub issue payload. - Fetcher yields {issue_info: , comments: [...]}, so check nested first.""" + Fetcher yields {issue_info: , comments: [...]}, so check nested first. + """ info = issue_data.get("issue_info") or issue_data date_str = info.get("updated_at") or info.get("created_at") if not date_str: @@ -105,6 +106,12 @@ def sync_raw_only( latest_issue: datetime | None = None latest_pr: datetime | None = None + # Derive a single start date for the unified issue+PR fetch: earliest of the two. + if start_issue and start_pr: + start_item = max(start_issue, start_pr) + else: + start_item = start_issue or start_pr + try: # Commits for commit_data in fetcher.fetch_commits_from_github( @@ -120,35 +127,31 @@ def sync_raw_only( if latest_commit is not None: clang_state.save_state(last_commit_date=latest_commit, merge=True) - # Issues - for issue_data in fetcher.fetch_issues_from_github( - client, owner, repo, start_issue, end_date + # Issues and PRs — fetched together via a single /issues list call. 
+ for item in fetcher.fetch_issues_and_prs_from_github( + client, owner, repo, start_item, end_date ): - issue_number = issue_data.get("number") or ( - issue_data.get("issue_info") or {} - ).get("number") - if issue_number is not None: - save_issue_raw_source(owner, repo, issue_data) - issue_numbers.append(issue_number) - dt = _issue_date(issue_data) - if dt and (latest_issue is None or dt > latest_issue): - latest_issue = dt + if "pr_info" in item: + pr_number = (item["pr_info"] or {}).get("number") + if pr_number is not None: + save_pr_raw_source(owner, repo, item) + pr_numbers.append(pr_number) + dt = _pr_date(item) + if dt and (latest_pr is None or dt > latest_pr): + latest_pr = dt + else: + issue_number = (item.get("issue_info") or {}).get("number") or item.get( + "number" + ) + if issue_number is not None: + save_issue_raw_source(owner, repo, item) + issue_numbers.append(issue_number) + dt = _issue_date(item) + if dt and (latest_issue is None or dt > latest_issue): + latest_issue = dt + if latest_issue is not None: clang_state.save_state(last_issue_date=latest_issue, merge=True) - - # PRs - for pr_data in fetcher.fetch_pull_requests_from_github( - client, owner, repo, start_pr, end_date - ): - pr_number = (pr_data.get("pr_info") or {}).get("number") or pr_data.get( - "number" - ) - if pr_number is not None: - save_pr_raw_source(owner, repo, pr_data) - pr_numbers.append(pr_number) - dt = _pr_date(pr_data) - if dt and (latest_pr is None or dt > latest_pr): - latest_pr = dt if latest_pr is not None: clang_state.save_state(last_pr_date=latest_pr, merge=True) diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 33e5f2e..d01f550 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -97,7 +97,9 @@ def _empty_sync_result() -> dict[str, Any]: } -def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: +def _build_documents_from_raw( + raw_documents: list[dict[str, Any]], +) -> list[Any]: """Convert 
preprocess output to langchain Documents; skip items missing doc_id/url.""" from langchain_core.documents import Document @@ -263,22 +265,16 @@ def sync_to_pinecone( ) if not raw_documents: - logger.info( "sync_to_pinecone: preprocess returned 0 documents for app_type=%s", app_type, ) - services.update_sync_status(app_type) - return _empty_sync_result() documents = _build_documents_from_raw(raw_documents) if not documents: - - services.update_sync_status(app_type) - return _empty_sync_result() attempted_source_ids = _extract_source_ids_from_documents(documents) diff --git a/cppa_pinecone_sync/tests/test_sync.py b/cppa_pinecone_sync/tests/test_sync.py index bd22395..a72e62c 100644 --- a/cppa_pinecone_sync/tests/test_sync.py +++ b/cppa_pinecone_sync/tests/test_sync.py @@ -161,7 +161,7 @@ def test_extract_new_failed_ids_skips_empty(): @pytest.mark.django_db def test_sync_to_pinecone_empty_preprocess_returns_early(app_type): - """sync_to_pinecone returns empty result and updates status when preprocess returns no docs.""" + """sync_to_pinecone returns empty result and does not update sync status when preprocess returns no docs.""" def preprocess(_failed_ids, _final_sync_at): return [], False @@ -170,12 +170,12 @@ def preprocess(_failed_ids, _final_sync_at): assert result["upserted"] == 0 assert result["total"] == 0 assert result["failed_ids"] == [] - assert services.get_final_sync_at(app_type) is not None + assert services.get_final_sync_at(app_type) is None @pytest.mark.django_db def test_sync_to_pinecone_all_invalid_docs_returns_early(app_type): - """sync_to_pinecone returns empty result when all raw docs lack doc_id/url.""" + """sync_to_pinecone returns empty result and does not update sync status when all raw docs lack doc_id/url.""" def preprocess(_failed_ids, _final_sync_at): return [ @@ -185,6 +185,7 @@ def preprocess(_failed_ids, _final_sync_at): result = sync_to_pinecone(app_type, "ns", preprocess) assert result["upserted"] == 0 assert result["total"] == 0 + 
assert services.get_final_sync_at(app_type) is None @pytest.mark.django_db diff --git a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index 5621c76..deef12b 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -9,6 +9,7 @@ import time from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Iterator, Optional +from urllib.parse import parse_qs, urlparse import requests @@ -18,6 +19,24 @@ logger = logging.getLogger(__name__) +def _make_aware(dt: datetime) -> datetime: + """Return dt as UTC-aware; if naive, assume UTC.""" + return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt + + +def _in_date_range( + dt: datetime, + start_time: Optional[datetime], + end_time: Optional[datetime], +) -> bool: + """Return True if dt falls within [start_time, end_time] (UTC-aware, both inclusive).""" + if start_time and dt < _make_aware(start_time): + return False + if end_time and dt > _make_aware(end_time): + return False + return True + + def fetch_user_from_github( client: GitHubAPIClient, username: str = "", @@ -42,6 +61,54 @@ def fetch_user_from_github( return None +def _is_first_page_url(url: str) -> bool: + """Return True if the URL's page= query param is 1 or absent (GitHub default).""" + try: + pages = parse_qs(urlparse(url).query).get("page") + return int(pages[0]) == 1 if pages else True + except (ValueError, IndexError): + return False + + +def _yield_commit_with_stats( + client: GitHubAPIClient, + owner: str, + repo: str, + commit: dict, + start_time: Optional[datetime], + end_time: Optional[datetime], +) -> Iterator[dict]: + """Filter a single commit list entry by date range, fetch full stats, and yield.""" + commit_date_str = commit.get("commit", {}).get("author", {}).get( + "date" + ) or commit.get("commit", {}).get("committer", {}).get("date") + if commit_date_str: + try: + commit_dt = datetime.fromisoformat(commit_date_str.replace("Z", "+00:00")) + if not 
_in_date_range(commit_dt, start_time, end_time): + return + except Exception as e: + logger.debug("Failed to parse commit date '%s': %s", commit_date_str, e) + + try: + commit_with_stats = client.rest_request( + f"/repos/{owner}/{repo}/commits/{commit['sha']}" + ) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code in (502, 503, 504): + logger.warning( + "Aborting commit sync at %s for %s/%s after HTTP %s: %s", + commit["sha"][:7], + owner, + repo, + e.response.status_code, + e, + ) + raise + raise + yield commit_with_stats + + def fetch_commits_from_github( client: GitHubAPIClient, owner: str, @@ -50,110 +117,97 @@ def fetch_commits_from_github( end_time: Optional[datetime] = None, etag_cache: Optional[Any] = None, ) -> Iterator[dict]: - """Fetch commits from GitHub API (paginated). Yields commit dicts with stats. - If etag_cache is provided, uses rest_request_conditional for the list GET. + """Fetch commits from GitHub API oldest-to-newest using Link header backward traversal. + + Fetches page 1 to discover the last page via the Link header, then walks backward + (last → prev → … → page 1), yielding commits in chronological order (oldest first) + within each page by reversing the newest-first GitHub default. + + The page-1 list response is cached in memory so when backward traversal returns to + page 1 via the "prev" link, no duplicate request is made. + + If etag_cache is provided, a conditional GET is used for page 1; a 304 means no + new commits exist in the requested date window and the function returns immediately. 
""" - logger.debug(f"Fetching commits for {owner}/{repo} from {start_time} to {end_time}") - page = 1 + logger.debug( + "Fetching commits for %s/%s from %s to %s", owner, repo, start_time, end_time + ) + per_page = 100 since_iso = start_time.isoformat() if start_time else "" until_iso = end_time.isoformat() if end_time else "" - - while True: - params = { - "per_page": per_page, - "page": page, - } - if start_time: - params["since"] = start_time.isoformat() - if end_time: - params["until"] = end_time.isoformat() - - response_etag = None - if etag_cache is not None: - etag = etag_cache.get("commits", page, since_iso, until_iso) - data, response_etag = client.rest_request_conditional( - f"/repos/{owner}/{repo}/commits", params=params, etag=etag + endpoint = f"/repos/{owner}/{repo}/commits" + + params: dict = {"per_page": per_page, "page": 1} + if start_time: + params["since"] = start_time.isoformat() + if end_time: + params["until"] = end_time.isoformat() + + # Fetch page 1 to discover total pages via Link header. 
+ first_page_etag: Optional[str] = None + if etag_cache is not None: + etag = etag_cache.get("commits", 1, since_iso, until_iso) + first_page_data, first_page_etag, first_page_links = ( + client.rest_request_conditional_with_all_links( + endpoint, params=params, etag=etag ) - if data is None: - logger.debug("Commits list page %s: 304 Not Modified, skipping", page) - page += 1 - time.sleep(0.2) - continue - commits = data - else: - commits = client.rest_request(f"/repos/{owner}/{repo}/commits", params) - - if not commits: - logger.debug(f"No more commits found at page {page}") - break - logger.debug(f"Fetched {len(commits)} commits from page {page}") - - for commit in reversed(commits): - commit_date_str = commit.get("commit", {}).get("author", {}).get( - "date" - ) or commit.get("commit", {}).get("committer", {}).get("date") - if commit_date_str: - try: - commit_dt = datetime.fromisoformat( - commit_date_str.replace("Z", "+00:00") - ) + ) + if first_page_data is None: + logger.debug("Commits list page 1: 304 Not Modified, nothing to process") + return + else: + first_page_data, first_page_links = client.rest_request_with_all_links( + endpoint, params + ) - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if commit_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if commit_dt > end_time_aware: - continue - except Exception as e: - logger.debug( - f"Failed to parse commit date '{commit_date_str}': {e}" - ) + if not first_page_data: + logger.debug("No commits found for %s/%s", owner, repo) + return - # Fetch full commit with stats (abort on 502/503/504 so page is not checkpointed and can be retried) - try: - commit_with_stats = client.rest_request( - f"/repos/{owner}/{repo}/commits/{commit['sha']}" - ) - except requests.exceptions.HTTPError as e: - if e.response is not 
None and e.response.status_code in ( - 502, - 503, - 504, - ): - logger.warning( - "Aborting commit sync at %s for %s/%s after HTTP %s: %s", - commit["sha"][:7], - owner, - repo, - e.response.status_code, - e, - ) - raise - raise - yield commit_with_stats + logger.debug( + "Fetched %d commits on page 1 for %s/%s", len(first_page_data), owner, repo + ) - if etag_cache is not None and response_etag: - etag_cache.set("commits", page, since_iso, until_iso, response_etag) + last_url = first_page_links.get("last") - if len(commits) < per_page: + if not last_url or _is_first_page_url(last_url): + # Single page: reverse to yield oldest-first, then cache ETag and return. + logger.debug("Single page of commits; processing in reverse order") + for commit in reversed(first_page_data): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) + return + + # Multiple pages: walk backward from last page to page 1, yielding oldest-first. + current_url: Optional[str] = last_url + while current_url is not None: + if _is_first_page_url(current_url): + # Reuse the already-fetched page-1 data — no extra API request. 
+ page_data = first_page_data + page_links = first_page_links + logger.debug("Backward traversal reached page 1; using cached data") + else: + page_data, page_links = client.rest_request_url_with_all_links(current_url) logger.debug( - f"Last page reached (got {len(commits)} commits, expected {per_page})" + "Fetched %d commits (backward traversal) from %s", + len(page_data) if page_data else 0, + current_url, ) - break - page += 1 - time.sleep(0.2) + time.sleep(0.2) + + for commit in reversed(page_data or []): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + + current_url = page_links.get("prev") + + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) def fetch_comments_from_github( @@ -200,26 +254,11 @@ def fetch_comments_from_github( if created_str: try: c_dt = datetime.fromisoformat(created_str.replace("Z", "+00:00")) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if c_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if c_dt > end_time_aware: - continue + if not _in_date_range(c_dt, start_time, end_time): + continue except Exception as e: logger.debug(f"Failed to parse comment date '{created_str}': {e}") + continue results.append(comment) @@ -318,22 +357,8 @@ def fetch_issues_from_github( logger.debug(f"Failed to parse issue date '{updated_str}': {e}") continue - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if issue_dt < start_time_aware: - continue - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if issue_dt > end_time_aware: - continue + if not _in_date_range(issue_dt, start_time, 
end_time): + continue issue_number = issue.get("number") if issue_number is not None: @@ -402,26 +427,11 @@ def fetch_pr_reviews_from_github( review_dt = datetime.fromisoformat( updated_str.replace("Z", "+00:00") ) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if review_dt < start_time_aware: - continue - - if end_time: - end_time_aware = ( - end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if review_dt > end_time_aware: - continue + if not _in_date_range(review_dt, start_time, end_time): + continue except Exception as e: logger.debug(f"Failed to parse review date '{updated_str}': {e}") + continue results.append(review) @@ -439,6 +449,148 @@ def fetch_pr_reviews_from_github( return results +def fetch_issues_and_prs_from_github( + client: GitHubAPIClient, + owner: str, + repo: str, + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + etag_cache: Optional[Any] = None, +) -> Iterator[dict]: + """Fetch issues and PRs from GitHub using a single /issues list endpoint. + + GitHub's issues API returns both issues and pull requests; this function routes each + item by the presence of the "pull_request" key: + - Issues → yield {"issue_info": {...}, "comments": [...]} + - PRs → yield {"pr_info": {...}, "comments": [...], "reviews": [...]} + + Uses Link-header pagination (direction=asc, sort=updated) so items are processed + oldest-updated-first. If etag_cache is provided, uses conditional GET for the first + page; a 304 means that page is unchanged, so it is skipped and pagination continues with the next page. 
+ """ + logger.debug( + "Fetching issues+PRs for %s/%s from %s to %s", owner, repo, start_time, end_time + ) + per_page = 100 + since_iso = start_time.isoformat() if start_time else "" + endpoint = f"/repos/{owner}/{repo}/issues" + next_url: Optional[str] = None + page_num = 1 + + while True: + response_etag: Optional[str] = None + try: + if next_url is not None: + items, next_url = client.rest_request_url(next_url) + page_num += 1 + else: + params: dict = { + "state": "all", + "per_page": per_page, + "page": page_num, + "sort": "updated", + "direction": "asc", + } + if start_time: + params["since"] = start_time.isoformat() + if etag_cache is not None: + etag = etag_cache.get("issues_and_prs", page_num, since_iso, "") + data, response_etag, next_url = ( + client.rest_request_conditional_with_link( + endpoint, params=params, etag=etag + ) + ) + if data is None: + logger.debug( + "Issues+PRs list page %s: 304 Not Modified, skipping", + page_num, + ) + page_num += 1 + time.sleep(0.2) + continue + items = data + else: + items, next_url = client.rest_request_with_link(endpoint, params) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code == 422: + logger.debug( + "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" + ) + break + raise + + if not items: + logger.debug("No more issues/PRs found") + break + + logger.debug( + "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num + ) + + for item in items: + updated_str = item.get("updated_at") or item.get("created_at") + if not updated_str: + continue + try: + item_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) + except (ValueError, TypeError) as e: + logger.debug("Failed to parse item date '%s': %s", updated_str, e) + continue + + if not _in_date_range(item_dt, start_time, end_time): + continue + + number = item.get("number") + if number is None: + continue + + if "pull_request" in item: + # PR: fetch full detail from /pulls 
endpoint, then comments + reviews. + try: + full_pr = client.rest_request( + f"/repos/{owner}/{repo}/pulls/{number}" + ) + if full_pr and isinstance(full_pr, dict): + item = full_pr + except Exception as e: + logger.debug("Failed to fetch full PR #%s: %s", number, e) + logger.debug("Fetching comments for PR #%s", number) + comments = fetch_comments_from_github( + client, owner, repo, number, start_time, end_time + ) + time.sleep(0.2) + logger.debug("Fetching reviews for PR #%s", number) + reviews = fetch_pr_reviews_from_github( + client, owner, repo, number, start_time, end_time + ) + time.sleep(0.2) + yield {"pr_info": item, "comments": comments, "reviews": reviews} + else: + # Issue: fetch full detail from /issues endpoint, then comments. + try: + full_issue = client.rest_request( + f"/repos/{owner}/{repo}/issues/{number}" + ) + if full_issue and isinstance(full_issue, dict): + item = full_issue + except Exception as e: + logger.debug("Failed to fetch full issue #%s: %s", number, e) + logger.debug("Fetching comments for issue #%s", number) + comments = fetch_comments_from_github( + client, owner, repo, number, start_time, end_time + ) + logger.debug("Found %d comments for issue #%s", len(comments), number) + yield {"issue_info": item, "comments": comments} + + if etag_cache is not None and response_etag: + etag_cache.set("issues_and_prs", page_num, since_iso, "", response_etag) + + if next_url is None: + logger.debug('Last page reached (no Link rel="next")') + break + time.sleep(0.2) + + def fetch_pull_requests_from_github( client: GitHubAPIClient, owner: str, @@ -489,25 +641,11 @@ def fetch_pull_requests_from_github( if updated_str: try: pr_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) - - if start_time: - start_time_aware = ( - start_time.replace(tzinfo=timezone.utc) - if start_time.tzinfo is None - else start_time - ) - if pr_dt < start_time_aware: - flag = True - break - - if end_time: - end_time_aware = ( - 
end_time.replace(tzinfo=timezone.utc) - if end_time.tzinfo is None - else end_time - ) - if pr_dt > end_time_aware: - continue + if start_time and pr_dt < _make_aware(start_time): + flag = True + break + if end_time and pr_dt > _make_aware(end_time): + continue except Exception as e: logger.debug("Failed to parse PR date '%s': %s", updated_str, e) continue diff --git a/github_activity_tracker/services.py b/github_activity_tracker/services.py index 41a8d80..041be69 100644 --- a/github_activity_tracker/services.py +++ b/github_activity_tracker/services.py @@ -196,7 +196,7 @@ def create_or_update_commit( commit_at: Optional[datetime] = None, ) -> tuple[GitCommit, bool]: """Create or update a GitCommit by repo + commit_hash. Returns (commit, created).""" - from datetime import datetime, timezone + from datetime import datetime if not commit_at: commit_at = datetime.now(timezone.utc) @@ -377,7 +377,6 @@ def create_or_update_issue( issue_obj.issue_created_at = issue_created_at issue_obj.issue_updated_at = issue_updated_at issue_obj.issue_closed_at = issue_closed_at - issue_obj.updated_at = datetime.now(timezone.utc) issue_obj.save() return issue_obj, created @@ -465,7 +464,6 @@ def create_or_update_pull_request( pr_obj.pr_updated_at = pr_updated_at pr_obj.pr_merged_at = pr_merged_at pr_obj.pr_closed_at = pr_closed_at - pr_obj.updated_at = datetime.now(timezone.utc) pr_obj.save() return pr_obj, created diff --git a/github_activity_tracker/sync/__init__.py b/github_activity_tracker/sync/__init__.py index c7e0f18..481a607 100644 --- a/github_activity_tracker/sync/__init__.py +++ b/github_activity_tracker/sync/__init__.py @@ -1,7 +1,7 @@ """ GitHub sync package: read last updated from DB, fetch from GitHub, save via services. -Split by entity: repos, commits, issues, pull_requests. +Split by entity: repos, commits, issues_and_prs. Entry point: sync_github(repo) runs all in order for that repo. Accepts GitHubRepository or any subclass (e.g. 
BoostLibraryRepository); base fields are used. """ @@ -12,8 +12,7 @@ from typing import TYPE_CHECKING, Optional from .commits import sync_commits -from .issues import sync_issues -from .pull_requests import sync_pull_requests +from .issues_and_prs import sync_issues_and_prs from .repos import sync_repos if TYPE_CHECKING: @@ -25,7 +24,10 @@ def sync_github( start_date: Optional[datetime] = None, end_date: Optional[datetime] = None, ) -> dict[str, list[int]]: - """Run full sync for one repo: repos (metadata), then commits, issues, pull requests. + """Run full sync for one repo: repos (metadata), then commits, issues and pull requests. + + Issues and PRs are fetched together via a single GitHub /issues list call which + returns both; items are routed internally by the presence of a "pull_request" key. Accepts GitHubRepository or a subclass (e.g. BoostLibraryRepository); the same base row is used, so extended models can be passed and sync will work. @@ -41,6 +43,4 @@ def sync_github( """ sync_repos(repo) sync_commits(repo, start_date=start_date, end_date=end_date) - issue_numbers = sync_issues(repo, start_date=start_date, end_date=end_date) - pr_numbers = sync_pull_requests(repo, start_date=start_date, end_date=end_date) - return {"issues": issue_numbers, "pull_requests": pr_numbers} + return sync_issues_and_prs(repo, start_date=start_date, end_date=end_date) diff --git a/github_activity_tracker/sync/issues.py b/github_activity_tracker/sync/issues.py deleted file mode 100644 index 6cf3e22..0000000 --- a/github_activity_tracker/sync/issues.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Sync GitHub issues (comments, assignees, labels) with the database. - -Flow: -1. Process existing JSON files in workspace///issues/*.json (load → DB → remove file). -2. Fetch from GitHub, save each as issues/.json, persist to DB, then remove the file. 
-""" - -from __future__ import annotations - -import json -import logging -from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Optional - -from cppa_user_tracker.services import get_or_create_github_account -from github_activity_tracker import fetcher, services -from .raw_source import save_issue_raw_source -from .etag_cache import RedisListETagCache -from github_activity_tracker.workspace import ( - get_issue_json_path, - iter_existing_issue_jsons, -) -from github_ops import get_github_client -from github_ops.client import ConnectionException, RateLimitException -from github_activity_tracker.sync.utils import ( - normalize_issue_json, - parse_datetime, - parse_github_user, -) - -if TYPE_CHECKING: - from github_activity_tracker.models import GitHubRepository - -logger = logging.getLogger(__name__) - - -def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: - """Apply one issue dict (with comments, assignees, labels) to the database. - Accepts flat or nested { issue_info, comments } format.""" - issue_data = normalize_issue_json(issue_data) - user_info = parse_github_user(issue_data.get("user")) - if not user_info["account_id"]: - logger.warning( - "Issue #%s: no user account_id; skipping", - issue_data.get("number", "?"), - ) - return - account, _ = get_or_create_github_account( - github_account_id=user_info["account_id"], - username=user_info["username"], - display_name=user_info["display_name"], - avatar_url=user_info["avatar_url"], - ) - - issue_obj, _ = services.create_or_update_issue( - repo=repo, - account=account, - issue_number=issue_data.get("number"), - issue_id=issue_data.get("id"), - title=issue_data.get("title", ""), - body=issue_data.get("body", ""), - state=issue_data.get("state", "open"), - state_reason=issue_data.get("state_reason", ""), - issue_created_at=parse_datetime(issue_data.get("created_at")), - issue_updated_at=parse_datetime(issue_data.get("updated_at")), - 
issue_closed_at=parse_datetime(issue_data.get("closed_at")), - ) - - for comment_data in issue_data.get("comments", []): - comment_user_info = parse_github_user(comment_data.get("user")) - if comment_user_info["account_id"]: - comment_account, _ = get_or_create_github_account( - github_account_id=comment_user_info["account_id"], - username=comment_user_info["username"], - display_name=comment_user_info["display_name"], - avatar_url=comment_user_info["avatar_url"], - ) - services.create_or_update_issue_comment( - issue=issue_obj, - account=comment_account, - issue_comment_id=comment_data.get("id"), - body=comment_data.get("body", ""), - issue_comment_created_at=parse_datetime(comment_data.get("created_at")), - issue_comment_updated_at=parse_datetime(comment_data.get("updated_at")), - ) - - assignee_infos = [parse_github_user(a) for a in issue_data.get("assignees", [])] - current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} - for assignee_account in issue_obj.assignees.all(): - if assignee_account.github_account_id not in current_assignee_ids: - services.remove_issue_assignee(issue_obj, assignee_account) - for assignee_info in assignee_infos: - if assignee_info["account_id"]: - assignee_account, _ = get_or_create_github_account( - github_account_id=assignee_info["account_id"], - username=assignee_info["username"], - display_name=assignee_info["display_name"], - avatar_url=assignee_info["avatar_url"], - ) - services.add_issue_assignee(issue_obj, assignee_account) - - for label_data in issue_data.get("labels", []): - label_name = label_data.get("name", "") - if label_name: - services.add_issue_label(issue_obj, label_name) - - logger.debug("Issue #%s: saved to DB", issue_data.get("number")) - - -def _process_existing_issue_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: - """Load each issues/*.json in workspace for this repo, save to DB, remove file. 
- - Returns: - (count, issue_numbers) — count of processed files and their issue numbers. - """ - owner = repo.owner_account.username - repo_name = repo.repo_name - count = 0 - numbers: list[int] = [] - for path in iter_existing_issue_jsons(owner, repo_name): - try: - data = json.loads(path.read_text(encoding="utf-8")) - _process_issue_data(repo, data) - save_issue_raw_source(owner, repo_name, data) - path.unlink() - number = (data.get("issue_info") or {}).get("number") or data.get("number") - if number is not None: - numbers.append(number) - count += 1 - except Exception as e: - logger.exception("Failed to process %s: %s", path, e) - return count, numbers - - -def sync_issues( - repo: GitHubRepository, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, -) -> list[int]: - """1) Process existing workspace JSONs; 2) Fetch from GitHub, save as JSON, persist to DB, remove file. - - Args: - repo: Repository to sync. - start_date: Override start date (default: last issue updated_at + 1s, or None if no issues). - end_date: Override end date (default: None = no end; stable ETag cache). - - Returns: - List of issue numbers processed during this sync run. 
- """ - logger.info("sync_issues: starting for repo id=%s (%s)", repo.pk, repo.repo_name) - - owner = repo.owner_account.username - repo_name = repo.repo_name - processed_numbers: list[int] = [] - - try: - # Phase 1: process existing JSON files - n_existing, existing_numbers = _process_existing_issue_jsons(repo) - processed_numbers.extend(existing_numbers) - if n_existing: - logger.info("sync_issues: processed %s existing issue JSON(s)", n_existing) - - # Phase 2: fetch from GitHub, write JSON, persist to DB, remove file - client = get_github_client() - if start_date is None: - last_issue = repo.issues.order_by("-issue_updated_at").first() - if last_issue: - start_date = last_issue.issue_updated_at + timedelta(seconds=1) - # Leave end_date as None when not set so ETag cache semantics stay stable. - - count = 0 - etag_cache = RedisListETagCache(repo_id=repo.pk) - for issue_data in fetcher.fetch_issues_from_github( - client, owner, repo_name, start_date, end_date, etag_cache=etag_cache - ): - issue_number = (issue_data.get("issue_info") or {}).get( - "number" - ) or issue_data.get("number") - if issue_number is None: - continue - json_path = get_issue_json_path(owner, repo_name, issue_number) - json_path.parent.mkdir(parents=True, exist_ok=True) - json_path.write_text( - json.dumps(issue_data, indent=2, default=str), encoding="utf-8" - ) - _process_issue_data(repo, issue_data) - save_issue_raw_source(owner, repo_name, issue_data) - json_path.unlink() - processed_numbers.append(issue_number) - count += 1 - - logger.info( - "sync_issues: finished for repo id=%s; %s existing + %s fetched", - repo.pk, - n_existing, - count, - ) - - except (RateLimitException, ConnectionException) as e: - logger.error("sync_issues: failed for repo id=%s: %s", repo.pk, e) - raise - except Exception as e: - logger.exception("sync_issues: unexpected error for repo id=%s: %s", repo.pk, e) - raise - - return processed_numbers diff --git a/github_activity_tracker/sync/issues_and_prs.py 
b/github_activity_tracker/sync/issues_and_prs.py new file mode 100644 index 0000000..71f3810 --- /dev/null +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -0,0 +1,370 @@ +""" +Sync GitHub issues and pull requests together using a single /issues list API call. + +Flow: +1. Process existing JSON files in workspace/{owner}/{repo}/issues/*.json and prs/*.json. +2. Fetch from GitHub via fetch_issues_and_prs_from_github (one endpoint, routes by key). + For each item: save as JSON, persist to DB, remove file. +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timedelta +from typing import TYPE_CHECKING, Optional + +from cppa_user_tracker.services import get_or_create_github_account +from github_activity_tracker import fetcher, services +from github_activity_tracker.sync.etag_cache import RedisListETagCache +from github_activity_tracker.sync.raw_source import save_issue_raw_source, save_pr_raw_source +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, + parse_datetime, + parse_github_user, +) +from github_activity_tracker.workspace import ( + get_issue_json_path, + get_pr_json_path, + iter_existing_issue_jsons, + iter_existing_pr_jsons, +) +from github_ops import get_github_client +from github_ops.client import ConnectionException, RateLimitException + +if TYPE_CHECKING: + from github_activity_tracker.models import GitHubRepository + +logger = logging.getLogger(__name__) + + +def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: + """Apply one issue dict (with comments, assignees, labels) to the database. 
+ Accepts flat or nested { issue_info, comments } format.""" + issue_data = normalize_issue_json(issue_data) + user_info = parse_github_user(issue_data.get("user")) + if not user_info["account_id"]: + logger.warning( + "Issue #%s: no user account_id; skipping", + issue_data.get("number", "?"), + ) + return + account, _ = get_or_create_github_account( + github_account_id=user_info["account_id"], + username=user_info["username"], + display_name=user_info["display_name"], + avatar_url=user_info["avatar_url"], + ) + + issue_obj, _ = services.create_or_update_issue( + repo=repo, + account=account, + issue_number=issue_data.get("number"), + issue_id=issue_data.get("id"), + title=issue_data.get("title", ""), + body=issue_data.get("body", ""), + state=issue_data.get("state", "open"), + state_reason=issue_data.get("state_reason", ""), + issue_created_at=parse_datetime(issue_data.get("created_at")), + issue_updated_at=parse_datetime(issue_data.get("updated_at")), + issue_closed_at=parse_datetime(issue_data.get("closed_at")), + ) + + for comment_data in issue_data.get("comments", []): + comment_user_info = parse_github_user(comment_data.get("user")) + if comment_user_info["account_id"]: + comment_account, _ = get_or_create_github_account( + github_account_id=comment_user_info["account_id"], + username=comment_user_info["username"], + display_name=comment_user_info["display_name"], + avatar_url=comment_user_info["avatar_url"], + ) + services.create_or_update_issue_comment( + issue=issue_obj, + account=comment_account, + issue_comment_id=comment_data.get("id"), + body=comment_data.get("body", ""), + issue_comment_created_at=parse_datetime(comment_data.get("created_at")), + issue_comment_updated_at=parse_datetime(comment_data.get("updated_at")), + ) + + assignee_infos = [parse_github_user(a) for a in issue_data.get("assignees", [])] + current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} + for assignee_account in issue_obj.assignees.all(): + if 
assignee_account.github_account_id not in current_assignee_ids: + services.remove_issue_assignee(issue_obj, assignee_account) + for assignee_info in assignee_infos: + if assignee_info["account_id"]: + assignee_account, _ = get_or_create_github_account( + github_account_id=assignee_info["account_id"], + username=assignee_info["username"], + display_name=assignee_info["display_name"], + avatar_url=assignee_info["avatar_url"], + ) + services.add_issue_assignee(issue_obj, assignee_account) + + for label_data in issue_data.get("labels", []): + label_name = label_data.get("name", "") + if label_name: + services.add_issue_label(issue_obj, label_name) + + logger.debug("Issue #%s: saved to DB", issue_data.get("number")) + + +def _process_existing_issue_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: + """Load each issues/*.json in workspace for this repo, save to DB, remove file. + + Returns: + (count, issue_numbers) — count of processed files and their issue numbers. + """ + owner = repo.owner_account.username + repo_name = repo.repo_name + count = 0 + numbers: list[int] = [] + for path in iter_existing_issue_jsons(owner, repo_name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + _process_issue_data(repo, data) + save_issue_raw_source(owner, repo_name, data) + path.unlink() + number = (data.get("issue_info") or {}).get("number") or data.get("number") + if number is not None: + numbers.append(number) + count += 1 + except Exception as e: + logger.exception("Failed to process %s: %s", path, e) + return count, numbers + + +def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: + """Apply one PR dict (with comments, reviews, assignees, labels) to the database. 
+ Accepts flat or nested { pr_info, comments, reviews } format.""" + pr_data = normalize_pr_json(pr_data) + user_info = parse_github_user(pr_data.get("user")) + if not user_info["account_id"]: + logger.warning( + "PR #%s: no user account_id; skipping", + pr_data.get("number", "?"), + ) + return + account, _ = get_or_create_github_account( + github_account_id=user_info["account_id"], + username=user_info["username"], + display_name=user_info["display_name"], + avatar_url=user_info["avatar_url"], + ) + + pr_obj, _ = services.create_or_update_pull_request( + repo=repo, + account=account, + pr_number=pr_data.get("number"), + pr_id=pr_data.get("id"), + title=pr_data.get("title", ""), + body=pr_data.get("body", ""), + state=pr_data.get("state", "open"), + head_hash=pr_data.get("head", {}).get("sha", ""), + base_hash=pr_data.get("base", {}).get("sha", ""), + pr_created_at=parse_datetime(pr_data.get("created_at")), + pr_updated_at=parse_datetime(pr_data.get("updated_at")), + pr_merged_at=parse_datetime(pr_data.get("merged_at")), + pr_closed_at=parse_datetime(pr_data.get("closed_at")), + ) + + for comment_data in pr_data.get("comments", []): + comment_user_info = parse_github_user(comment_data.get("user")) + if comment_user_info["account_id"]: + comment_account, _ = get_or_create_github_account( + github_account_id=comment_user_info["account_id"], + username=comment_user_info["username"], + display_name=comment_user_info["display_name"], + avatar_url=comment_user_info["avatar_url"], + ) + services.create_or_update_pr_comment( + pr=pr_obj, + account=comment_account, + pr_comment_id=comment_data.get("id"), + body=comment_data.get("body", ""), + pr_comment_created_at=parse_datetime(comment_data.get("created_at")), + pr_comment_updated_at=parse_datetime(comment_data.get("updated_at")), + ) + + for review_data in pr_data.get("reviews", []): + review_user_info = parse_github_user(review_data.get("user")) + if review_user_info["account_id"]: + review_account, _ = 
get_or_create_github_account( + github_account_id=review_user_info["account_id"], + username=review_user_info["username"], + display_name=review_user_info["display_name"], + avatar_url=review_user_info["avatar_url"], + ) + services.create_or_update_pr_review( + pr=pr_obj, + account=review_account, + pr_review_id=review_data.get("id"), + body=review_data.get("body", ""), + in_reply_to_id=review_data.get("in_reply_to_id"), + pr_review_created_at=parse_datetime(review_data.get("created_at")), + pr_review_updated_at=parse_datetime(review_data.get("updated_at")), + ) + + for assignee_data in pr_data.get("assignees", []): + assignee_info = parse_github_user(assignee_data) + if assignee_info["account_id"]: + assignee_account, _ = get_or_create_github_account( + github_account_id=assignee_info["account_id"], + username=assignee_info["username"], + display_name=assignee_info["display_name"], + avatar_url=assignee_info["avatar_url"], + ) + services.add_pr_assignee(pr_obj, assignee_account) + + for label_data in pr_data.get("labels", []): + label_name = label_data.get("name", "") + if label_name: + services.add_pull_request_label(pr_obj, label_name) + + logger.debug("PR #%s: saved to DB", pr_data.get("number")) + + +def _process_existing_pr_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: + """Load each prs/*.json in workspace for this repo, save to DB, remove file. + + Returns: + (count, pr_numbers) — count of processed files and their PR numbers. 
+ """ + owner = repo.owner_account.username + repo_name = repo.repo_name + count = 0 + numbers: list[int] = [] + for path in iter_existing_pr_jsons(owner, repo_name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + _process_pr_data(repo, data) + save_pr_raw_source(owner, repo_name, data) + path.unlink() + number = (data.get("pr_info") or {}).get("number") or data.get("number") + if number is not None: + numbers.append(number) + count += 1 + except Exception as e: + logger.exception("Failed to process %s: %s", path, e) + return count, numbers + + +def sync_issues_and_prs( + repo: GitHubRepository, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, +) -> dict[str, list[int]]: + """Sync issues and PRs for a repo using a single GitHub /issues list call. + + 1. Process any existing issue/PR JSON files left from a previous interrupted run. + 2. Determine the start date as the earliest of the last-seen issue and PR update times. + 3. Fetch items via fetch_issues_and_prs_from_github; each item is routed by key: + - "issue_info" → persisted as an issue + - "pr_info" → persisted as a pull request + + Args: + repo: Repository to sync. + start_date: Override start date (default: derived from DB; see below). + end_date: Override end date (default: None = no upper bound). + + Returns: + {"issues": [], "pull_requests": []} + """ + logger.info( + "sync_issues_and_prs: starting for repo id=%s (%s)", repo.pk, repo.repo_name + ) + + owner = repo.owner_account.username + repo_name = repo.repo_name + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] + + try: + # Phase 1: process any JSON files left from a previous interrupted run. 
+ n_issues, existing_issue_nums = _process_existing_issue_jsons(repo) + issue_numbers.extend(existing_issue_nums) + n_prs, existing_pr_nums = _process_existing_pr_jsons(repo) + pr_numbers.extend(existing_pr_nums) + if n_issues or n_prs: + logger.info( + "sync_issues_and_prs: processed %s existing issue JSON(s), %s existing PR JSON(s)", + n_issues, + n_prs, + ) + + # Phase 2: determine start date from the earliest of last issue / last PR update. + if start_date is None: + last_issue = repo.issues.order_by("-issue_updated_at").first() + last_pr = repo.pull_requests.order_by("-pr_updated_at").first() + + issue_date = (last_issue.issue_updated_at + timedelta(seconds=1)) if last_issue else None + pr_date = (last_pr.pr_updated_at + timedelta(seconds=1)) if last_pr else None + + if issue_date and pr_date: + start_date = min(issue_date, pr_date) + else: + start_date = issue_date or pr_date + + # Phase 3: fetch from GitHub, write JSON, persist to DB, remove file. + client = get_github_client() + etag_cache = RedisListETagCache(repo_id=repo.pk) + count_issues = 0 + count_prs = 0 + + for item in fetcher.fetch_issues_and_prs_from_github( + client, owner, repo_name, start_date, end_date, etag_cache=etag_cache + ): + if "pr_info" in item: + pr_number = (item["pr_info"] or {}).get("number") + if pr_number is None: + continue + json_path = get_pr_json_path(owner, repo_name, pr_number) + json_path.parent.mkdir(parents=True, exist_ok=True) + json_path.write_text( + json.dumps(item, indent=2, default=str), encoding="utf-8" + ) + _process_pr_data(repo, item) + save_pr_raw_source(owner, repo_name, item) + json_path.unlink() + pr_numbers.append(pr_number) + count_prs += 1 + else: + issue_number = (item.get("issue_info") or {}).get("number") or item.get( + "number" + ) + if issue_number is None: + continue + json_path = get_issue_json_path(owner, repo_name, issue_number) + json_path.parent.mkdir(parents=True, exist_ok=True) + json_path.write_text( + json.dumps(item, indent=2, 
default=str), encoding="utf-8" + ) + _process_issue_data(repo, item) + save_issue_raw_source(owner, repo_name, item) + json_path.unlink() + issue_numbers.append(issue_number) + count_issues += 1 + + logger.info( + "sync_issues_and_prs: finished for repo id=%s; " + "%s existing issues + %s fetched; %s existing PRs + %s fetched", + repo.pk, + n_issues, + count_issues, + n_prs, + count_prs, + ) + + except (RateLimitException, ConnectionException) as e: + logger.error("sync_issues_and_prs: failed for repo id=%s: %s", repo.pk, e) + raise + except Exception as e: + logger.exception( + "sync_issues_and_prs: unexpected error for repo id=%s: %s", repo.pk, e + ) + raise + + return {"issues": issue_numbers, "pull_requests": pr_numbers} diff --git a/github_activity_tracker/sync/pull_requests.py b/github_activity_tracker/sync/pull_requests.py deleted file mode 100644 index b9a81d5..0000000 --- a/github_activity_tracker/sync/pull_requests.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -Sync GitHub pull requests (reviews, comments, assignees, labels) with the database. - -Flow: -1. Process existing JSON files in workspace///prs/*.json (load → DB → remove file). -2. Fetch from GitHub, save each as prs/.json, persist to DB, then remove the file. 
-""" - -from __future__ import annotations - -import json -import logging -from datetime import datetime, timedelta -from typing import TYPE_CHECKING, Optional - -from cppa_user_tracker.services import get_or_create_github_account -from github_activity_tracker import fetcher, services -from .raw_source import save_pr_raw_source -from .etag_cache import RedisListETagCache -from github_activity_tracker.workspace import ( - get_pr_json_path, - iter_existing_pr_jsons, -) -from github_ops import get_github_client -from github_ops.client import ConnectionException, RateLimitException -from github_activity_tracker.sync.utils import ( - normalize_pr_json, - parse_datetime, - parse_github_user, -) - -if TYPE_CHECKING: - from github_activity_tracker.models import GitHubRepository - -logger = logging.getLogger(__name__) - - -def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: - """Apply one PR dict (with comments, reviews, assignees, labels) to the database. - Accepts flat or nested { pr_info, comments, reviews } format.""" - pr_data = normalize_pr_json(pr_data) - user_info = parse_github_user(pr_data.get("user")) - if not user_info["account_id"]: - logger.warning( - "PR #%s: no user account_id; skipping", - pr_data.get("number", "?"), - ) - return - account, _ = get_or_create_github_account( - github_account_id=user_info["account_id"], - username=user_info["username"], - display_name=user_info["display_name"], - avatar_url=user_info["avatar_url"], - ) - - pr_obj, _ = services.create_or_update_pull_request( - repo=repo, - account=account, - pr_number=pr_data.get("number"), - pr_id=pr_data.get("id"), - title=pr_data.get("title", ""), - body=pr_data.get("body", ""), - state=pr_data.get("state", "open"), - head_hash=pr_data.get("head", {}).get("sha", ""), - base_hash=pr_data.get("base", {}).get("sha", ""), - pr_created_at=parse_datetime(pr_data.get("created_at")), - pr_updated_at=parse_datetime(pr_data.get("updated_at")), - 
pr_merged_at=parse_datetime(pr_data.get("merged_at")), - pr_closed_at=parse_datetime(pr_data.get("closed_at")), - ) - - for comment_data in pr_data.get("comments", []): - comment_user_info = parse_github_user(comment_data.get("user")) - if comment_user_info["account_id"]: - comment_account, _ = get_or_create_github_account( - github_account_id=comment_user_info["account_id"], - username=comment_user_info["username"], - display_name=comment_user_info["display_name"], - avatar_url=comment_user_info["avatar_url"], - ) - services.create_or_update_pr_comment( - pr=pr_obj, - account=comment_account, - pr_comment_id=comment_data.get("id"), - body=comment_data.get("body", ""), - pr_comment_created_at=parse_datetime(comment_data.get("created_at")), - pr_comment_updated_at=parse_datetime(comment_data.get("updated_at")), - ) - - for review_data in pr_data.get("reviews", []): - review_user_info = parse_github_user(review_data.get("user")) - if review_user_info["account_id"]: - review_account, _ = get_or_create_github_account( - github_account_id=review_user_info["account_id"], - username=review_user_info["username"], - display_name=review_user_info["display_name"], - avatar_url=review_user_info["avatar_url"], - ) - services.create_or_update_pr_review( - pr=pr_obj, - account=review_account, - pr_review_id=review_data.get("id"), - body=review_data.get("body", ""), - in_reply_to_id=review_data.get("in_reply_to_id"), - pr_review_created_at=parse_datetime(review_data.get("created_at")), - pr_review_updated_at=parse_datetime(review_data.get("updated_at")), - ) - - for assignee_data in pr_data.get("assignees", []): - assignee_info = parse_github_user(assignee_data) - if assignee_info["account_id"]: - assignee_account, _ = get_or_create_github_account( - github_account_id=assignee_info["account_id"], - username=assignee_info["username"], - display_name=assignee_info["display_name"], - avatar_url=assignee_info["avatar_url"], - ) - services.add_pr_assignee(pr_obj, assignee_account) - - 
for label_data in pr_data.get("labels", []): - label_name = label_data.get("name", "") - if label_name: - services.add_pull_request_label(pr_obj, label_name) - - logger.debug("PR #%s: saved to DB", pr_data.get("number")) - - -def _process_existing_pr_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: - """Load each prs/*.json in workspace for this repo, save to DB, remove file. - - Returns: - (count, pr_numbers) — count of processed files and their PR numbers. - """ - owner = repo.owner_account.username - repo_name = repo.repo_name - count = 0 - numbers: list[int] = [] - for path in iter_existing_pr_jsons(owner, repo_name): - try: - data = json.loads(path.read_text(encoding="utf-8")) - _process_pr_data(repo, data) - save_pr_raw_source(owner, repo_name, data) - path.unlink() - number = (data.get("pr_info") or {}).get("number") or data.get("number") - if number is not None: - numbers.append(number) - count += 1 - except Exception as e: - logger.exception("Failed to process %s: %s", path, e) - return count, numbers - - -def sync_pull_requests( - repo: GitHubRepository, - start_date: Optional[datetime] = None, - end_date: Optional[datetime] = None, -) -> list[int]: - """1) Process existing workspace JSONs; 2) Fetch from GitHub, save as JSON, persist to DB, remove file. - - Args: - repo: Repository to sync. - start_date: Override start date (default: last PR updated_at + 1s, or None if no PRs). - end_date: Override end date (default: None = no end; stable ETag cache). - - Returns: - List of PR numbers processed during this sync run. 
- """ - logger.info( - "sync_pull_requests: starting for repo id=%s (%s)", - repo.pk, - repo.repo_name, - ) - - owner = repo.owner_account.username - repo_name = repo.repo_name - processed_numbers: list[int] = [] - - try: - # Phase 1: process existing JSON files - n_existing, existing_numbers = _process_existing_pr_jsons(repo) - processed_numbers.extend(existing_numbers) - if n_existing: - logger.info( - "sync_pull_requests: processed %s existing PR JSON(s)", - n_existing, - ) - - # Phase 2: fetch from GitHub, write JSON, persist to DB, remove file - client = get_github_client() - if start_date is None: - last_pr = repo.pull_requests.order_by("-pr_updated_at").first() - if last_pr: - start_date = last_pr.pr_updated_at + timedelta(seconds=1) - # Leave end_date as None when not set so ETag cache semantics stay stable. - - count = 0 - etag_cache = RedisListETagCache(repo_id=repo.pk) - for pr_data in fetcher.fetch_pull_requests_from_github( - client, owner, repo_name, start_date, end_date, etag_cache=etag_cache - ): - pr_number = (pr_data.get("pr_info") or {}).get("number") or pr_data.get( - "number" - ) - if pr_number is None: - continue - json_path = get_pr_json_path(owner, repo_name, pr_number) - json_path.parent.mkdir(parents=True, exist_ok=True) - json_path.write_text( - json.dumps(pr_data, indent=2, default=str), encoding="utf-8" - ) - _process_pr_data(repo, pr_data) - save_pr_raw_source(owner, repo_name, pr_data) - json_path.unlink() - processed_numbers.append(pr_number) - count += 1 - - logger.info( - "sync_pull_requests: finished for repo id=%s; %s existing + %s fetched", - repo.pk, - n_existing, - count, - ) - - except (RateLimitException, ConnectionException) as e: - logger.error("sync_pull_requests: failed for repo id=%s: %s", repo.pk, e) - raise - except Exception as e: - logger.exception( - "sync_pull_requests: unexpected error for repo id=%s: %s", - repo.pk, - e, - ) - raise - - return processed_numbers diff --git 
a/github_activity_tracker/tests/test_client_link_parsing.py b/github_activity_tracker/tests/test_client_link_parsing.py new file mode 100644 index 0000000..fd10c36 --- /dev/null +++ b/github_activity_tracker/tests/test_client_link_parsing.py @@ -0,0 +1,57 @@ +"""Tests for GitHubAPIClient Link header parsing methods.""" + +from github_ops.client import GitHubAPIClient + + +def test_parse_link_rels_parses_all_rels(): + """_parse_link_rels returns dict with all rel→url pairs from Link header.""" + link_header = ( + '; rel="next", ' + '; rel="last", ' + '; rel="first"' + ) + result = GitHubAPIClient._parse_link_rels(link_header) + assert result == { + "next": "https://api.github.com/repos/o/r/commits?page=2", + "last": "https://api.github.com/repos/o/r/commits?page=50", + "first": "https://api.github.com/repos/o/r/commits?page=1", + } + + +def test_parse_link_rels_handles_prev_rel(): + """_parse_link_rels includes prev rel when present.""" + link_header = ( + '; rel="prev", ' + '; rel="first"' + ) + result = GitHubAPIClient._parse_link_rels(link_header) + assert result == { + "prev": "https://api.github.com/repos/o/r/commits?page=49", + "first": "https://api.github.com/repos/o/r/commits?page=1", + } + + +def test_parse_link_rels_returns_empty_dict_when_no_header(): + """_parse_link_rels returns empty dict when Link header is None or empty.""" + assert GitHubAPIClient._parse_link_rels(None) == {} + assert GitHubAPIClient._parse_link_rels("") == {} + + +def test_parse_link_rels_handles_single_rel(): + """_parse_link_rels works with a single rel in the header.""" + link_header = '; rel="next"' + result = GitHubAPIClient._parse_link_rels(link_header) + assert result == {"next": "https://api.github.com/repos/o/r/commits?page=2"} + + +def test_parse_link_rels_handles_github_repository_id_format(): + """_parse_link_rels handles GitHub's /repositories/{id}/commits format.""" + link_header = ( + '; rel="first", ' + '; rel="prev"' + ) + result = 
GitHubAPIClient._parse_link_rels(link_header) + assert result == { + "first": "https://api.github.com/repositories/7590028/commits?per_page=100&page=1", + "prev": "https://api.github.com/repositories/7590028/commits?per_page=100&page=522", + } diff --git a/github_activity_tracker/tests/test_fetcher.py b/github_activity_tracker/tests/test_fetcher.py index 569fb31..47fc87f 100644 --- a/github_activity_tracker/tests/test_fetcher.py +++ b/github_activity_tracker/tests/test_fetcher.py @@ -2,7 +2,7 @@ import pytest from datetime import datetime, timezone -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock from github_activity_tracker.fetcher import ( fetch_comments_from_github, @@ -71,19 +71,16 @@ def test_fetch_user_from_github_returns_none_when_empty_response(): def test_fetch_commits_from_github_yields_commit_dicts(): """fetch_commits_from_github yields full commit dict from /repos/.../commits/{sha}.""" client = MagicMock() - client.rest_request.side_effect = [ - [ - { - "sha": "abc", - "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}, - } - ], - { - "sha": "abc", - "commit": {"message": "msg"}, - "stats": {"additions": 1}, - }, - ] + # New API: rest_request_with_all_links returns (data, links_dict) + client.rest_request_with_all_links.return_value = ( + [{"sha": "abc", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {}, # No links = single page + ) + client.rest_request.return_value = { + "sha": "abc", + "commit": {"message": "msg"}, + "stats": {"additions": 1}, + } items = list(fetch_commits_from_github(client, "o", "r")) assert len(items) == 1 assert items[0]["sha"] == "abc" @@ -93,84 +90,71 @@ def test_fetch_commits_from_github_yields_commit_dicts(): def test_fetch_commits_from_github_stops_on_empty_page(): """fetch_commits_from_github stops when API returns empty list.""" client = MagicMock() - client.rest_request.return_value = [] + client.rest_request_with_all_links.return_value = ([], {}) items = 
list(fetch_commits_from_github(client, "owner", "repo")) assert items == [] - client.rest_request.assert_called_once() def test_fetch_commits_from_github_includes_since_until_params(): """fetch_commits_from_github passes since/until when start_time/end_time given.""" client = MagicMock() - client.rest_request.return_value = [] + client.rest_request_with_all_links.return_value = ([], {}) start = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 12, 31, tzinfo=timezone.utc) list(fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end)) - call_args = client.rest_request.call_args - params = call_args[0][1] or {} + call_args = client.rest_request_with_all_links.call_args + # params is the second positional argument + params = call_args[0][1] if len(call_args[0]) > 1 else call_args[1]["params"] assert "since" in params assert "until" in params def test_fetch_commits_from_github_with_etag_cache_304_yields_nothing(): - """When etag_cache is passed and rest_request_conditional returns 304, page is skipped - and next page is requested; when next page returns empty, no items yielded and set not called. - """ + """When etag_cache is passed and rest_request_conditional_with_all_links returns 304, nothing is yielded.""" client = MagicMock() - # Page 1: 304 -> skip; page 2: empty -> break. No items, no etag_cache.set. 
- client.rest_request_conditional.side_effect = [ - (None, 'W/"cached"'), # page 1: 304 - ([], None), # page 2: empty, stops loop - ] + # Page 1: 304 -> return immediately (new behavior) + client.rest_request_conditional_with_all_links.return_value = ( + None, + 'W/"cached"', + {}, + ) etag_cache = MagicMock() etag_cache.get.return_value = 'W/"cached"' - with patch("github_activity_tracker.fetcher.time.sleep"): - items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + assert items == [] - assert client.rest_request_conditional.call_count == 2 - # Ensure we requested page 1 then page 2 (no re-requesting the same page). - call1_params = client.rest_request_conditional.call_args_list[0][1]["params"] - call2_params = client.rest_request_conditional.call_args_list[1][1]["params"] - assert call1_params["page"] == 1 - assert call2_params["page"] == 2 + client.rest_request_conditional_with_all_links.assert_called_once() etag_cache.set.assert_not_called() def test_fetch_commits_from_github_with_etag_cache_200_yields_and_sets(): - """When etag_cache is passed and rest_request_conditional returns 200, yields items and calls set - only after the page's items have been consumed (checkpoint deferred). - """ + """When etag_cache is passed and rest_request_conditional_with_all_links returns 200, yields items and calls set.""" client = MagicMock() - # Two items on page 1 so we can assert set() is not called until after both are consumed. 
- client.rest_request_conditional.side_effect = [ - ( - [ - {"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}, - {"sha": "def", "commit": {"author": {"date": "2024-06-02T00:00:00Z"}}}, - ], - "W/new_etag", - ), - ] + # Single page with two commits (newest first from API, yielded oldest first) + client.rest_request_conditional_with_all_links.return_value = ( + [ + {"sha": "def", "commit": {"author": {"date": "2024-06-02T00:00:00Z"}}}, + {"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}, + ], + "W/new_etag", + {}, # No links = single page + ) client.rest_request.side_effect = [ {"sha": "abc", "commit": {"message": "msg"}, "stats": {"additions": 1}}, {"sha": "def", "commit": {"message": "msg2"}, "stats": {"additions": 2}}, ] etag_cache = MagicMock() etag_cache.get.return_value = None - with patch("github_activity_tracker.fetcher.time.sleep"): - gen = fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache) - # Consume first item only; checkpoint must not be written yet. - first = next(gen) - etag_cache.set.assert_not_called() - # Consume second item; set still not called until we advance past the last yield. - second = next(gen) - etag_cache.set.assert_not_called() - # Advancing again runs the code after the for-loop (etag_cache.set) then exits. 
- with pytest.raises(StopIteration): - next(gen) - etag_cache.set.assert_called_once() - assert first["sha"] == "abc" - assert second["sha"] == "def" + + items = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + # Should yield oldest first: abc, def + assert len(items) == 2 + assert items[0]["sha"] == "abc" + assert items[1]["sha"] == "def" + # ETag should be cached after processing + etag_cache.set.assert_called_once() call_args = etag_cache.set.call_args[0] assert call_args[0] == "commits" assert call_args[1] == 1 @@ -182,21 +166,18 @@ def test_fetch_commits_from_github_aborts_on_502_503_504(): import requests as req client = MagicMock() - # API returns commits (e.g. newest first); fetcher iterates reversed(), so first - # full-commit fetch is for the last in this list (def456). That fetch returns 502 → abort. - client.rest_request.side_effect = [ + # Single page with commits (newest first from API) + client.rest_request_with_all_links.return_value = ( [ - { - "sha": "abc123", - "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}, - }, - { - "sha": "def456", - "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}, - }, + {"sha": "def456", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "abc123", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, ], - req.exceptions.HTTPError("Bad Gateway", response=MagicMock(status_code=502)), - ] + {}, + ) + # First detail fetch (for abc123, oldest) returns 502 + client.rest_request.side_effect = req.exceptions.HTTPError( + "Bad Gateway", response=MagicMock(status_code=502) + ) with pytest.raises(req.exceptions.HTTPError): list(fetch_commits_from_github(client, "o", "r")) @@ -206,20 +187,19 @@ def test_fetch_commits_from_github_5xx_with_etag_cache_does_not_checkpoint(): import requests as req client = MagicMock() - client.rest_request_conditional.side_effect = [ - ( - [{"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}], - "W/new_etag", - ), - ] + 
client.rest_request_conditional_with_all_links.return_value = ( + [{"sha": "abc", "commit": {"author": {"date": "2024-06-01T00:00:00Z"}}}], + "W/new_etag", + {}, + ) client.rest_request.side_effect = req.exceptions.HTTPError( "Bad Gateway", response=MagicMock(status_code=502) ) etag_cache = MagicMock() etag_cache.get.return_value = None - with patch("github_activity_tracker.fetcher.time.sleep"): - with pytest.raises(req.exceptions.HTTPError): - list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + with pytest.raises(req.exceptions.HTTPError): + list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) etag_cache.set.assert_not_called() @@ -228,10 +208,13 @@ def test_fetch_commits_from_github_reraises_non_server_error_http(): import requests as req client = MagicMock() - client.rest_request.side_effect = [ + client.rest_request_with_all_links.return_value = ( [{"sha": "abc", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], - req.exceptions.HTTPError("Forbidden", response=MagicMock(status_code=403)), - ] + {}, + ) + client.rest_request.side_effect = req.exceptions.HTTPError( + "Forbidden", response=MagicMock(status_code=403) + ) with pytest.raises(req.exceptions.HTTPError): list(fetch_commits_from_github(client, "o", "r")) diff --git a/github_activity_tracker/tests/test_fetcher_commits_backward.py b/github_activity_tracker/tests/test_fetcher_commits_backward.py new file mode 100644 index 0000000..3fce6f5 --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_commits_backward.py @@ -0,0 +1,129 @@ +"""Tests for fetch_commits_from_github backward pagination (oldest→newest).""" + +import pytest +from datetime import datetime, timezone +from unittest.mock import MagicMock, call + +from github_activity_tracker.fetcher import fetch_commits_from_github + + +def test_fetch_commits_single_page_yields_oldest_first(): + """fetch_commits_from_github with single page yields commits in reverse (oldest first).""" + client = 
MagicMock() + # Page 1 has no "last" link (single page) + client.rest_request_with_all_links.return_value = ( + [ + {"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}, + {"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, + ], + {}, # No links = single page + ) + client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield oldest→newest: c1, c2, c3 + assert [c["sha"] for c in commits] == ["c1", "c2", "c3"] + + +def test_fetch_commits_multiple_pages_backward_traversal(): + """fetch_commits_from_github walks backward from last page to first.""" + client = MagicMock() + + # Page 1: has "last" link pointing to page 3 + client.rest_request_with_all_links.return_value = ( + [{"sha": "c9", "commit": {"author": {"date": "2024-01-09T00:00:00Z"}}}], + {"next": "https://api.github.com/repos/o/r/commits?page=2", "last": "https://api.github.com/repos/o/r/commits?page=3"}, + ) + + # Page 3 (last): has "prev" pointing to page 2 + # Page 2: has "prev" pointing to page 1 + client.rest_request_url_with_all_links.side_effect = [ + # Page 3 + ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {"prev": "https://api.github.com/repos/o/r/commits?page=2", "first": "https://api.github.com/repos/o/r/commits?page=1"}, + ), + # Page 2 + ( + [{"sha": "c5", "commit": {"author": {"date": "2024-01-05T00:00:00Z"}}}], + {"prev": "https://api.github.com/repos/o/r/commits?page=1", "first": "https://api.github.com/repos/o/r/commits?page=1"}, + ), + # Page 1 is cached, not fetched again + ] + + client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield oldest→newest: c1 (page 3), c5 (page 2), c9 (page 1 cached) + assert [c["sha"] for c 
in commits] == ["c1", "c5", "c9"] + # Page 1 should NOT be fetched again via rest_request_url_with_all_links + assert client.rest_request_url_with_all_links.call_count == 2 + + +def test_fetch_commits_caches_first_page(): + """fetch_commits_from_github reuses cached page 1 data when prev returns to page 1.""" + client = MagicMock() + + # Page 1 + page1_data = [ + {"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}, + ] + client.rest_request_with_all_links.return_value = ( + page1_data, + {"last": "https://api.github.com/repos/o/r/commits?page=2"}, + ) + + # Page 2 (last): prev points back to page 1 + client.rest_request_url_with_all_links.return_value = ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {"prev": "https://api.github.com/repos/o/r/commits?page=1"}, + ) + + client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + # Should yield c1 (page 2), c3 (page 1 from cache) + assert [c["sha"] for c in commits] == ["c1", "c3"] + # rest_request_url_with_all_links called only once for page 2 + assert client.rest_request_url_with_all_links.call_count == 1 + + +def test_fetch_commits_filters_by_date_range(): + """fetch_commits_from_github filters commits outside start_time/end_time.""" + client = MagicMock() + client.rest_request_with_all_links.return_value = ( + [ + {"sha": "c4", "commit": {"author": {"date": "2024-01-04T00:00:00Z"}}}, + {"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}, + {"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}, + ], + {}, + ) + client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + start = datetime(2024, 1, 2, tzinfo=timezone.utc) + end = datetime(2024, 1, 3, tzinfo=timezone.utc) + commits = list(fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end)) + + # Only c2 is in range [2024-01-02, 
2024-01-03] + assert [c["sha"] for c in commits] == ["c2"] + + +def test_fetch_commits_handles_304_not_modified(): + """fetch_commits_from_github returns immediately on 304 when using etag_cache.""" + client = MagicMock() + etag_cache = MagicMock() + etag_cache.get.return_value = "abc123" + + # 304 response: data is None + client.rest_request_conditional_with_all_links.return_value = (None, "abc123", {}) + + commits = list(fetch_commits_from_github(client, "o", "r", etag_cache=etag_cache)) + + assert commits == [] + # Should not attempt to paginate + client.rest_request_url_with_all_links.assert_not_called() diff --git a/github_activity_tracker/tests/test_fetcher_date_helpers.py b/github_activity_tracker/tests/test_fetcher_date_helpers.py new file mode 100644 index 0000000..836425d --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_date_helpers.py @@ -0,0 +1,106 @@ +"""Tests for fetcher date range helper functions.""" + +from datetime import datetime, timezone + +from github_activity_tracker.fetcher import _make_aware, _in_date_range + + +def test_make_aware_converts_naive_to_utc(): + """_make_aware converts naive datetime to UTC-aware.""" + naive = datetime(2024, 1, 1, 12, 0, 0) + result = _make_aware(naive) + assert result.tzinfo == timezone.utc + assert result.year == 2024 + assert result.month == 1 + assert result.day == 1 + + +def test_make_aware_preserves_aware_datetime(): + """_make_aware returns aware datetime as-is.""" + aware = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + result = _make_aware(aware) + assert result is aware + + +def test_make_aware_preserves_non_utc_aware_datetime(): + """_make_aware returns non-UTC aware datetime as-is (does not convert to UTC).""" + from datetime import timedelta + + # Create a datetime in UTC+5 + utc_plus_5 = timezone(timedelta(hours=5)) + dt = datetime(2024, 1, 1, 12, 0, 0, tzinfo=utc_plus_5) + result = _make_aware(dt) + + # _make_aware returns aware datetimes as-is; it doesn't convert to UTC + 
assert result is dt + assert result.tzinfo == utc_plus_5 + + +def test_in_date_range_returns_true_when_in_range(): + """_in_date_range returns True when dt is within [start_time, end_time].""" + dt = datetime(2024, 1, 5, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is True + + +def test_in_date_range_returns_false_when_before_start(): + """_in_date_range returns False when dt is before start_time.""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + start = datetime(2024, 1, 5, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is False + + +def test_in_date_range_returns_false_when_after_end(): + """_in_date_range returns False when dt is after end_time.""" + dt = datetime(2024, 1, 15, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, end) is False + + +def test_in_date_range_returns_true_when_no_start_time(): + """_in_date_range returns True when start_time is None (no lower bound).""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + assert _in_date_range(dt, None, end) is True + + +def test_in_date_range_returns_true_when_no_end_time(): + """_in_date_range returns True when end_time is None (no upper bound).""" + dt = datetime(2024, 1, 15, tzinfo=timezone.utc) + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + + assert _in_date_range(dt, start, None) is True + + +def test_in_date_range_returns_true_when_no_bounds(): + """_in_date_range returns True when both start_time and end_time are None.""" + dt = datetime(2024, 1, 1, tzinfo=timezone.utc) + + assert _in_date_range(dt, None, None) is True + + +def test_in_date_range_handles_naive_start_and_end(): + """_in_date_range handles naive start_time and end_time by 
assuming UTC.""" + dt = datetime(2024, 1, 5, tzinfo=timezone.utc) + start = datetime(2024, 1, 1) # Naive + end = datetime(2024, 1, 10) # Naive + + assert _in_date_range(dt, start, end) is True + + +def test_in_date_range_inclusive_boundaries(): + """_in_date_range is inclusive on both boundaries.""" + start = datetime(2024, 1, 1, tzinfo=timezone.utc) + end = datetime(2024, 1, 10, tzinfo=timezone.utc) + + # Exactly at start + assert _in_date_range(start, start, end) is True + # Exactly at end + assert _in_date_range(end, start, end) is True diff --git a/github_activity_tracker/tests/test_fetcher_issues_and_prs.py b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py new file mode 100644 index 0000000..d140b83 --- /dev/null +++ b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py @@ -0,0 +1,175 @@ +"""Tests for fetch_issues_and_prs_from_github unified fetcher.""" + +import pytest +from datetime import datetime, timezone +from unittest.mock import MagicMock, call + +from github_activity_tracker.fetcher import fetch_issues_and_prs_from_github + + +def test_fetch_issues_and_prs_routes_issue_correctly(): + """fetch_issues_and_prs_from_github yields issue with issue_info key when no pull_request key.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + { + "number": 1, + "updated_at": "2024-01-01T00:00:00Z", + "title": "Bug", + } + ], + None, # No next page + ) + client.rest_request.side_effect = [ + {"number": 1, "title": "Bug", "body": "Full issue"}, # Full issue detail + [], # Comments + ] + + items = list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + assert len(items) == 1 + assert "issue_info" in items[0] + assert "pr_info" not in items[0] + assert items[0]["issue_info"]["number"] == 1 + assert items[0]["comments"] == [] + + +def test_fetch_issues_and_prs_routes_pr_correctly(): + """fetch_issues_and_prs_from_github yields PR with pr_info key when pull_request key present.""" + client = MagicMock() + 
client.rest_request_with_link.return_value = ( + [ + { + "number": 2, + "updated_at": "2024-01-02T00:00:00Z", + "title": "Feature", + "pull_request": {"url": "https://api.github.com/repos/o/r/pulls/2"}, + } + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 2, "title": "Feature", "body": "Full PR"}, # Full PR detail + [], # Comments + [], # Reviews + ] + + items = list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + assert len(items) == 1 + assert "pr_info" in items[0] + assert "issue_info" not in items[0] + assert items[0]["pr_info"]["number"] == 2 + assert items[0]["comments"] == [] + assert items[0]["reviews"] == [] + + +def test_fetch_issues_and_prs_fetches_both_in_one_call(): + """fetch_issues_and_prs_from_github processes both issues and PRs from single /issues list.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + {"number": 1, "updated_at": "2024-01-01T00:00:00Z", "title": "Issue"}, + { + "number": 2, + "updated_at": "2024-01-02T00:00:00Z", + "title": "PR", + "pull_request": {"url": "..."}, + }, + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 1, "title": "Issue"}, # Issue detail + [], # Issue comments + {"number": 2, "title": "PR"}, # PR detail + [], # PR comments + [], # PR reviews + ] + + items = list(fetch_issues_and_prs_from_github(client, "o", "r")) + + assert len(items) == 2 + assert "issue_info" in items[0] + assert "pr_info" in items[1] + + +def test_fetch_issues_and_prs_uses_direction_asc(): + """fetch_issues_and_prs_from_github requests items with direction=asc (oldest first).""" + client = MagicMock() + client.rest_request_with_link.return_value = ([], None) + + list(fetch_issues_and_prs_from_github(client, "owner", "repo")) + + # Check the params argument (second positional arg) + call_args = client.rest_request_with_link.call_args + params = call_args[0][1] if len(call_args[0]) > 1 else call_args[1].get("params") + assert params["direction"] == "asc" + assert 
params["sort"] == "updated" + + +def test_fetch_issues_and_prs_filters_by_date_range(): + """fetch_issues_and_prs_from_github filters items outside start_time/end_time.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [ + {"number": 1, "updated_at": "2024-01-01T00:00:00Z"}, + {"number": 2, "updated_at": "2024-01-05T00:00:00Z"}, + {"number": 3, "updated_at": "2024-01-10T00:00:00Z"}, + ], + None, + ) + client.rest_request.side_effect = [ + {"number": 2}, # Only #2 in range + [], # Comments + ] + + start = datetime(2024, 1, 2, tzinfo=timezone.utc) + end = datetime(2024, 1, 8, tzinfo=timezone.utc) + items = list(fetch_issues_and_prs_from_github(client, "o", "r", start_time=start, end_time=end)) + + assert len(items) == 1 + assert items[0]["issue_info"]["number"] == 2 + + +def test_fetch_issues_and_prs_paginates_with_link_header(): + """fetch_issues_and_prs_from_github follows Link rel=next for pagination.""" + client = MagicMock() + client.rest_request_with_link.return_value = ( + [{"number": 1, "updated_at": "2024-01-01T00:00:00Z"}], + "https://api.github.com/page=2", + ) + client.rest_request_url.return_value = ( + [{"number": 2, "updated_at": "2024-01-02T00:00:00Z"}], + None, + ) + client.rest_request.side_effect = [ + {"number": 1}, + [], + {"number": 2}, + [], + ] + + items = list(fetch_issues_and_prs_from_github(client, "o", "r")) + + assert len(items) == 2 + client.rest_request_url.assert_called_once_with("https://api.github.com/page=2") + + +def test_fetch_issues_and_prs_handles_304_not_modified(): + """fetch_issues_and_prs_from_github skips page on 304 when using etag_cache.""" + client = MagicMock() + etag_cache = MagicMock() + etag_cache.get.return_value = "etag123" + + # First page: 304, second page: empty (end of pagination) + client.rest_request_conditional_with_link.side_effect = [ + (None, "etag123", None), # Page 1: 304 + ([], "new_etag", None), # Page 2: empty list (stops pagination) + ] + + items = 
list(fetch_issues_and_prs_from_github(client, "o", "r", etag_cache=etag_cache)) + + assert items == [] + # Should have tried page 1 (304) and page 2 (empty) + assert client.rest_request_conditional_with_link.call_count == 2 diff --git a/github_activity_tracker/tests/test_sync.py b/github_activity_tracker/tests/test_sync.py index a79b847..55ec71b 100644 --- a/github_activity_tracker/tests/test_sync.py +++ b/github_activity_tracker/tests/test_sync.py @@ -7,35 +7,47 @@ def test_sync_github_passes_start_date_end_date_to_sync_modules(): - """sync_github forwards start_date and end_date to sync_commits, sync_issues, sync_pull_requests.""" + """sync_github forwards start_date and end_date to sync_commits and sync_issues_and_prs.""" mock_repo = MagicMock() start = datetime(2024, 1, 1) end = datetime(2024, 12, 31) with patch("github_activity_tracker.sync.sync_repos") as m_repos, patch( "github_activity_tracker.sync.sync_commits" ) as m_commits, patch( - "github_activity_tracker.sync.sync_issues" - ) as m_issues, patch( - "github_activity_tracker.sync.sync_pull_requests" - ) as m_prs: - sync_github(mock_repo, start_date=start, end_date=end) + "github_activity_tracker.sync.sync_issues_and_prs" + ) as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [], "pull_requests": []} + result = sync_github(mock_repo, start_date=start, end_date=end) + m_repos.assert_called_once_with(mock_repo) m_commits.assert_called_once_with(mock_repo, start_date=start, end_date=end) - m_issues.assert_called_once_with(mock_repo, start_date=start, end_date=end) - m_prs.assert_called_once_with(mock_repo, start_date=start, end_date=end) + m_issues_and_prs.assert_called_once_with(mock_repo, start_date=start, end_date=end) + assert result == {"issues": [], "pull_requests": []} def test_sync_github_calls_sync_without_dates_when_none(): - """sync_github calls sync_commits/issues/pull_requests with start_date and end_date None when not provided.""" + """sync_github calls sync_commits and 
sync_issues_and_prs with start_date and end_date None when not provided.""" mock_repo = MagicMock() with patch("github_activity_tracker.sync.sync_repos"), patch( "github_activity_tracker.sync.sync_commits" ) as m_commits, patch( - "github_activity_tracker.sync.sync_issues" - ) as m_issues, patch( - "github_activity_tracker.sync.sync_pull_requests" - ) as m_prs: - sync_github(mock_repo) + "github_activity_tracker.sync.sync_issues_and_prs" + ) as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [1, 2], "pull_requests": [3]} + result = sync_github(mock_repo) + m_commits.assert_called_once_with(mock_repo, start_date=None, end_date=None) - m_issues.assert_called_once_with(mock_repo, start_date=None, end_date=None) - m_prs.assert_called_once_with(mock_repo, start_date=None, end_date=None) + m_issues_and_prs.assert_called_once_with(mock_repo, start_date=None, end_date=None) + assert result == {"issues": [1, 2], "pull_requests": [3]} + + +def test_sync_github_returns_issues_and_prs_dict(): + """sync_github returns dict with issues and pull_requests keys from sync_issues_and_prs.""" + mock_repo = MagicMock() + with patch("github_activity_tracker.sync.sync_repos"), patch( + "github_activity_tracker.sync.sync_commits" + ), patch("github_activity_tracker.sync.sync_issues_and_prs") as m_issues_and_prs: + m_issues_and_prs.return_value = {"issues": [10, 20], "pull_requests": [30, 40]} + result = sync_github(mock_repo) + + assert result == {"issues": [10, 20], "pull_requests": [30, 40]} diff --git a/github_activity_tracker/tests/test_sync_issues_and_prs.py b/github_activity_tracker/tests/test_sync_issues_and_prs.py new file mode 100644 index 0000000..f16aaf6 --- /dev/null +++ b/github_activity_tracker/tests/test_sync_issues_and_prs.py @@ -0,0 +1,193 @@ +"""Tests for sync_issues_and_prs unified sync function.""" + +import pytest +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch, call + +from 
github_activity_tracker.sync.issues_and_prs import ( + sync_issues_and_prs, + _process_issue_data, + _process_pr_data, + _process_existing_issue_jsons, + _process_existing_pr_jsons, +) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_processes_both_types( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs routes items by key to issue or PR processing.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + + # Yield one issue and one PR + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ + {"issue_info": {"number": 1}, "comments": []}, + {"pr_info": {"number": 2}, "comments": [], "reviews": []}, + ] + + with patch("github_activity_tracker.sync.issues_and_prs._process_issue_data") as mock_proc_issue, \ + patch("github_activity_tracker.sync.issues_and_prs._process_pr_data") as mock_proc_pr, \ + patch("github_activity_tracker.sync.issues_and_prs.save_issue_raw_source"), \ + patch("github_activity_tracker.sync.issues_and_prs.save_pr_raw_source"), \ + patch("github_activity_tracker.sync.issues_and_prs.get_issue_json_path") as mock_issue_path, \ + patch("github_activity_tracker.sync.issues_and_prs.get_pr_json_path") as mock_pr_path: + + mock_issue_path.return_value = MagicMock() + mock_pr_path.return_value = MagicMock() + + result = sync_issues_and_prs(mock_repo) + + assert 
result == {"issues": [1], "pull_requests": [2]} + mock_proc_issue.assert_called_once() + mock_proc_pr.assert_called_once() + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_uses_min_start_date( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs uses earliest of last_issue and last_pr updated_at as start_date.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + + # Last issue updated at 2024-01-05 + mock_last_issue = MagicMock() + mock_last_issue.issue_updated_at = datetime(2024, 1, 5, tzinfo=timezone.utc) + mock_repo.issues.order_by.return_value.first.return_value = mock_last_issue + + # Last PR updated at 2024-01-03 (earlier) + mock_last_pr = MagicMock() + mock_last_pr.pr_updated_at = datetime(2024, 1, 3, tzinfo=timezone.utc) + mock_repo.pull_requests.order_by.return_value.first.return_value = mock_last_pr + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + sync_issues_and_prs(mock_repo) + + # Should use 2024-01-03 + 1s (earliest) + call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args + start_date = call_args[0][3] # Fourth positional arg + assert start_date == datetime(2024, 1, 3, 0, 0, 1, tzinfo=timezone.utc) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") 
+@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_processes_existing_jsons_first( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs processes leftover JSON files before fetching from GitHub.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + # Existing JSONs found + mock_existing_issues.return_value = (2, [10, 11]) + mock_existing_prs.return_value = (1, [20]) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + result = sync_issues_and_prs(mock_repo) + + # Should include existing numbers in result + assert 10 in result["issues"] + assert 11 in result["issues"] + assert 20 in result["pull_requests"] + mock_existing_issues.assert_called_once_with(mock_repo) + mock_existing_prs.assert_called_once_with(mock_repo) + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_respects_override_start_date( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs uses provided start_date instead of deriving from DB.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] + + 
override_start = datetime(2023, 1, 1, tzinfo=timezone.utc) + sync_issues_and_prs(mock_repo, start_date=override_start) + + # Should NOT query DB for last issue/PR + mock_repo.issues.order_by.assert_not_called() + mock_repo.pull_requests.order_by.assert_not_called() + + # Should pass override_start to fetcher + call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args + assert call_args[0][3] == override_start + + +@patch("github_activity_tracker.sync.issues_and_prs.get_github_client") +@patch("github_activity_tracker.sync.issues_and_prs.fetcher") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") +@patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") +def test_sync_issues_and_prs_saves_and_removes_json_files( + mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client +): + """sync_issues_and_prs writes JSON, processes, then removes file for each item.""" + mock_repo = MagicMock() + mock_repo.owner_account.username = "owner" + mock_repo.repo_name = "repo" + mock_repo.issues.order_by.return_value.first.return_value = None + mock_repo.pull_requests.order_by.return_value.first.return_value = None + + mock_existing_issues.return_value = (0, []) + mock_existing_prs.return_value = (0, []) + + mock_client = MagicMock() + mock_get_client.return_value = mock_client + mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ + {"issue_info": {"number": 1}, "comments": []}, + ] + + mock_json_path = MagicMock() + + with patch("github_activity_tracker.sync.issues_and_prs._process_issue_data"), \ + patch("github_activity_tracker.sync.issues_and_prs.save_issue_raw_source"), \ + patch("github_activity_tracker.sync.issues_and_prs.get_issue_json_path", return_value=mock_json_path): + + sync_issues_and_prs(mock_repo) + + # Should write, then unlink + mock_json_path.parent.mkdir.assert_called_once() + mock_json_path.write_text.assert_called_once() + mock_json_path.unlink.assert_called_once() diff 
--git a/github_ops/client.py b/github_ops/client.py index 3a73a89..610a8fa 100644 --- a/github_ops/client.py +++ b/github_ops/client.py @@ -400,6 +400,21 @@ def rest_request_conditional_with_link( next_url = self._parse_link_next(response.headers.get("Link")) return (response.json(), response_etag, next_url) + def rest_request_conditional_with_all_links( + self, + endpoint: str, + params: Optional[dict] = None, + etag: Optional[str] = None, + ) -> tuple[Optional[Union[list, dict]], Optional[str], dict[str, str]]: + """Like rest_request_conditional but returns all Link rels as a dict. + Returns (data, response_etag, links_dict). On 304: (None, etag, {}). + """ + response, response_etag = self._rest_get(endpoint, params=params, etag=etag) + if response is None: + return (None, response_etag, {}) + links = self._parse_link_rels(response.headers.get("Link")) + return (response.json(), response_etag, links) + @staticmethod def _parse_link_next(link_header: Optional[str]) -> Optional[str]: """Parse GitHub Link response header; return URL for rel=\"next\" or None. @@ -410,6 +425,42 @@ def _parse_link_next(link_header: Optional[str]) -> Optional[str]: match = re.search(r'<([^>]+)>;\s*rel="next"', link_header) return match.group(1) if match else None + @staticmethod + def _parse_link_rels(link_header: Optional[str]) -> dict[str, str]: + """Parse GitHub Link response header; return a dict of all rel→url pairs. + Example: {"next": "https://...", "last": "https://...", "prev": "https://..."} + """ + if not link_header: + return {} + return { + rel: url + for url, rel in re.findall(r'<([^>]+)>;\s*rel="([^"]+)"', link_header) + } + + def rest_request_with_all_links( + self, endpoint: str, params: Optional[dict] = None + ) -> tuple[Union[list, dict], dict[str, str]]: + """GET request that returns (data, links_dict) with all Link rels. + links_dict keys include "next", "prev", "last", "first" when present. 
+ """ + response, _ = self._rest_get(endpoint, params=params) + if response is None: + return ({}, {}) + data = response.json() + links = self._parse_link_rels(response.headers.get("Link")) + return (data, links) + + def rest_request_url_with_all_links( + self, full_url: str + ) -> tuple[Union[list, dict], dict[str, str]]: + """GET full_url (e.g. from Link header) and return (data, links_dict) with all rels. + Uses same session (auth, rate limit). For paginated backward/forward traversal. + """ + response = self._rest_get_url(full_url) + data = response.json() + links = self._parse_link_rels(response.headers.get("Link")) + return (data, links) + def rest_request_with_link( self, endpoint: str, params: Optional[dict] = None ) -> tuple[Union[list, dict], Optional[str]]: From 85e0d6a17b07d94365a5cfb8a76b3c3c1e5751b9 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 25 Mar 2026 02:09:12 -0400 Subject: [PATCH 28/76] Fix: lint/format error - #125 --- .../sync/issues_and_prs.py | 15 ++- .../tests/test_fetcher_commits_backward.py | 56 +++++++---- .../tests/test_fetcher_date_helpers.py | 20 ++-- .../tests/test_fetcher_issues_and_prs.py | 17 ++-- .../tests/test_sync_issues_and_prs.py | 95 ++++++++++--------- 5 files changed, 123 insertions(+), 80 deletions(-) diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py index 71f3810..2f95c8f 100644 --- a/github_activity_tracker/sync/issues_and_prs.py +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -17,7 +17,10 @@ from cppa_user_tracker.services import get_or_create_github_account from github_activity_tracker import fetcher, services from github_activity_tracker.sync.etag_cache import RedisListETagCache -from github_activity_tracker.sync.raw_source import save_issue_raw_source, save_pr_raw_source +from github_activity_tracker.sync.raw_source import ( + save_issue_raw_source, + save_pr_raw_source, +) from github_activity_tracker.sync.utils import ( 
normalize_issue_json, normalize_pr_json, @@ -300,8 +303,14 @@ def sync_issues_and_prs( last_issue = repo.issues.order_by("-issue_updated_at").first() last_pr = repo.pull_requests.order_by("-pr_updated_at").first() - issue_date = (last_issue.issue_updated_at + timedelta(seconds=1)) if last_issue else None - pr_date = (last_pr.pr_updated_at + timedelta(seconds=1)) if last_pr else None + issue_date = ( + (last_issue.issue_updated_at + timedelta(seconds=1)) + if last_issue + else None + ) + pr_date = ( + (last_pr.pr_updated_at + timedelta(seconds=1)) if last_pr else None + ) if issue_date and pr_date: start_date = min(issue_date, pr_date) diff --git a/github_activity_tracker/tests/test_fetcher_commits_backward.py b/github_activity_tracker/tests/test_fetcher_commits_backward.py index 3fce6f5..60cc9f8 100644 --- a/github_activity_tracker/tests/test_fetcher_commits_backward.py +++ b/github_activity_tracker/tests/test_fetcher_commits_backward.py @@ -1,8 +1,7 @@ """Tests for fetch_commits_from_github backward pagination (oldest→newest).""" -import pytest from datetime import datetime, timezone -from unittest.mock import MagicMock, call +from unittest.mock import MagicMock from github_activity_tracker.fetcher import fetch_commits_from_github @@ -19,7 +18,10 @@ def test_fetch_commits_single_page_yields_oldest_first(): ], {}, # No links = single page ) - client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } commits = list(fetch_commits_from_github(client, "owner", "repo")) @@ -30,30 +32,42 @@ def test_fetch_commits_single_page_yields_oldest_first(): def test_fetch_commits_multiple_pages_backward_traversal(): """fetch_commits_from_github walks backward from last page to first.""" client = MagicMock() - + # Page 1: has "last" link pointing to page 3 client.rest_request_with_all_links.return_value = ( [{"sha": "c9", "commit": {"author": {"date": 
"2024-01-09T00:00:00Z"}}}], - {"next": "https://api.github.com/repos/o/r/commits?page=2", "last": "https://api.github.com/repos/o/r/commits?page=3"}, + { + "next": "https://api.github.com/repos/o/r/commits?page=2", + "last": "https://api.github.com/repos/o/r/commits?page=3", + }, ) - + # Page 3 (last): has "prev" pointing to page 2 # Page 2: has "prev" pointing to page 1 client.rest_request_url_with_all_links.side_effect = [ # Page 3 ( [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], - {"prev": "https://api.github.com/repos/o/r/commits?page=2", "first": "https://api.github.com/repos/o/r/commits?page=1"}, + { + "prev": "https://api.github.com/repos/o/r/commits?page=2", + "first": "https://api.github.com/repos/o/r/commits?page=1", + }, ), # Page 2 ( [{"sha": "c5", "commit": {"author": {"date": "2024-01-05T00:00:00Z"}}}], - {"prev": "https://api.github.com/repos/o/r/commits?page=1", "first": "https://api.github.com/repos/o/r/commits?page=1"}, + { + "prev": "https://api.github.com/repos/o/r/commits?page=1", + "first": "https://api.github.com/repos/o/r/commits?page=1", + }, ), # Page 1 is cached, not fetched again ] - - client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } commits = list(fetch_commits_from_github(client, "owner", "repo")) @@ -66,7 +80,7 @@ def test_fetch_commits_multiple_pages_backward_traversal(): def test_fetch_commits_caches_first_page(): """fetch_commits_from_github reuses cached page 1 data when prev returns to page 1.""" client = MagicMock() - + # Page 1 page1_data = [ {"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}, @@ -75,14 +89,17 @@ def test_fetch_commits_caches_first_page(): page1_data, {"last": "https://api.github.com/repos/o/r/commits?page=2"}, ) - + # Page 2 (last): prev points back to page 1 client.rest_request_url_with_all_links.return_value = ( [{"sha": "c1", 
"commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], {"prev": "https://api.github.com/repos/o/r/commits?page=1"}, ) - - client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } commits = list(fetch_commits_from_github(client, "owner", "repo")) @@ -103,11 +120,16 @@ def test_fetch_commits_filters_by_date_range(): ], {}, ) - client.rest_request.side_effect = lambda url: {"sha": url.split("/")[-1], "stats": {}} + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } start = datetime(2024, 1, 2, tzinfo=timezone.utc) end = datetime(2024, 1, 3, tzinfo=timezone.utc) - commits = list(fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end)) + commits = list( + fetch_commits_from_github(client, "o", "r", start_time=start, end_time=end) + ) # Only c2 is in range [2024-01-02, 2024-01-03] assert [c["sha"] for c in commits] == ["c2"] @@ -118,7 +140,7 @@ def test_fetch_commits_handles_304_not_modified(): client = MagicMock() etag_cache = MagicMock() etag_cache.get.return_value = "abc123" - + # 304 response: data is None client.rest_request_conditional_with_all_links.return_value = (None, "abc123", {}) diff --git a/github_activity_tracker/tests/test_fetcher_date_helpers.py b/github_activity_tracker/tests/test_fetcher_date_helpers.py index 836425d..0dbd638 100644 --- a/github_activity_tracker/tests/test_fetcher_date_helpers.py +++ b/github_activity_tracker/tests/test_fetcher_date_helpers.py @@ -25,12 +25,12 @@ def test_make_aware_preserves_aware_datetime(): def test_make_aware_preserves_non_utc_aware_datetime(): """_make_aware returns non-UTC aware datetime as-is (does not convert to UTC).""" from datetime import timedelta - + # Create a datetime in UTC+5 utc_plus_5 = timezone(timedelta(hours=5)) dt = datetime(2024, 1, 1, 12, 0, 0, tzinfo=utc_plus_5) result = _make_aware(dt) - + # 
_make_aware returns aware datetimes as-is; it doesn't convert to UTC assert result is dt assert result.tzinfo == utc_plus_5 @@ -41,7 +41,7 @@ def test_in_date_range_returns_true_when_in_range(): dt = datetime(2024, 1, 5, tzinfo=timezone.utc) start = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 1, 10, tzinfo=timezone.utc) - + assert _in_date_range(dt, start, end) is True @@ -50,7 +50,7 @@ def test_in_date_range_returns_false_when_before_start(): dt = datetime(2024, 1, 1, tzinfo=timezone.utc) start = datetime(2024, 1, 5, tzinfo=timezone.utc) end = datetime(2024, 1, 10, tzinfo=timezone.utc) - + assert _in_date_range(dt, start, end) is False @@ -59,7 +59,7 @@ def test_in_date_range_returns_false_when_after_end(): dt = datetime(2024, 1, 15, tzinfo=timezone.utc) start = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 1, 10, tzinfo=timezone.utc) - + assert _in_date_range(dt, start, end) is False @@ -67,7 +67,7 @@ def test_in_date_range_returns_true_when_no_start_time(): """_in_date_range returns True when start_time is None (no lower bound).""" dt = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 1, 10, tzinfo=timezone.utc) - + assert _in_date_range(dt, None, end) is True @@ -75,14 +75,14 @@ def test_in_date_range_returns_true_when_no_end_time(): """_in_date_range returns True when end_time is None (no upper bound).""" dt = datetime(2024, 1, 15, tzinfo=timezone.utc) start = datetime(2024, 1, 1, tzinfo=timezone.utc) - + assert _in_date_range(dt, start, None) is True def test_in_date_range_returns_true_when_no_bounds(): """_in_date_range returns True when both start_time and end_time are None.""" dt = datetime(2024, 1, 1, tzinfo=timezone.utc) - + assert _in_date_range(dt, None, None) is True @@ -91,7 +91,7 @@ def test_in_date_range_handles_naive_start_and_end(): dt = datetime(2024, 1, 5, tzinfo=timezone.utc) start = datetime(2024, 1, 1) # Naive end = datetime(2024, 1, 10) # Naive - + assert _in_date_range(dt, start, end) is 
True @@ -99,7 +99,7 @@ def test_in_date_range_inclusive_boundaries(): """_in_date_range is inclusive on both boundaries.""" start = datetime(2024, 1, 1, tzinfo=timezone.utc) end = datetime(2024, 1, 10, tzinfo=timezone.utc) - + # Exactly at start assert _in_date_range(start, start, end) is True # Exactly at end diff --git a/github_activity_tracker/tests/test_fetcher_issues_and_prs.py b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py index d140b83..f75be7e 100644 --- a/github_activity_tracker/tests/test_fetcher_issues_and_prs.py +++ b/github_activity_tracker/tests/test_fetcher_issues_and_prs.py @@ -1,8 +1,7 @@ """Tests for fetch_issues_and_prs_from_github unified fetcher.""" -import pytest from datetime import datetime, timezone -from unittest.mock import MagicMock, call +from unittest.mock import MagicMock from github_activity_tracker.fetcher import fetch_issues_and_prs_from_github @@ -126,7 +125,11 @@ def test_fetch_issues_and_prs_filters_by_date_range(): start = datetime(2024, 1, 2, tzinfo=timezone.utc) end = datetime(2024, 1, 8, tzinfo=timezone.utc) - items = list(fetch_issues_and_prs_from_github(client, "o", "r", start_time=start, end_time=end)) + items = list( + fetch_issues_and_prs_from_github( + client, "o", "r", start_time=start, end_time=end + ) + ) assert len(items) == 1 assert items[0]["issue_info"]["number"] == 2 @@ -161,14 +164,16 @@ def test_fetch_issues_and_prs_handles_304_not_modified(): client = MagicMock() etag_cache = MagicMock() etag_cache.get.return_value = "etag123" - + # First page: 304, second page: empty (end of pagination) client.rest_request_conditional_with_link.side_effect = [ (None, "etag123", None), # Page 1: 304 - ([], "new_etag", None), # Page 2: empty list (stops pagination) + ([], "new_etag", None), # Page 2: empty list (stops pagination) ] - items = list(fetch_issues_and_prs_from_github(client, "o", "r", etag_cache=etag_cache)) + items = list( + fetch_issues_and_prs_from_github(client, "o", "r", etag_cache=etag_cache) 
+ ) assert items == [] # Should have tried page 1 (304) and page 2 (empty) diff --git a/github_activity_tracker/tests/test_sync_issues_and_prs.py b/github_activity_tracker/tests/test_sync_issues_and_prs.py index f16aaf6..ded6124 100644 --- a/github_activity_tracker/tests/test_sync_issues_and_prs.py +++ b/github_activity_tracker/tests/test_sync_issues_and_prs.py @@ -1,15 +1,10 @@ """Tests for sync_issues_and_prs unified sync function.""" -import pytest -from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock, patch, call +from datetime import datetime, timezone +from unittest.mock import MagicMock, patch from github_activity_tracker.sync.issues_and_prs import ( sync_issues_and_prs, - _process_issue_data, - _process_pr_data, - _process_existing_issue_jsons, - _process_existing_pr_jsons, ) @@ -26,31 +21,38 @@ def test_sync_issues_and_prs_processes_both_types( mock_repo.repo_name = "repo" mock_repo.issues.order_by.return_value.first.return_value = None mock_repo.pull_requests.order_by.return_value.first.return_value = None - + mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) - + mock_client = MagicMock() mock_get_client.return_value = mock_client - + # Yield one issue and one PR mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ {"issue_info": {"number": 1}, "comments": []}, {"pr_info": {"number": 2}, "comments": [], "reviews": []}, ] - - with patch("github_activity_tracker.sync.issues_and_prs._process_issue_data") as mock_proc_issue, \ - patch("github_activity_tracker.sync.issues_and_prs._process_pr_data") as mock_proc_pr, \ - patch("github_activity_tracker.sync.issues_and_prs.save_issue_raw_source"), \ - patch("github_activity_tracker.sync.issues_and_prs.save_pr_raw_source"), \ - patch("github_activity_tracker.sync.issues_and_prs.get_issue_json_path") as mock_issue_path, \ - patch("github_activity_tracker.sync.issues_and_prs.get_pr_json_path") as mock_pr_path: - + + with patch( + 
"github_activity_tracker.sync.issues_and_prs._process_issue_data" + ) as mock_proc_issue, patch( + "github_activity_tracker.sync.issues_and_prs._process_pr_data" + ) as mock_proc_pr, patch( + "github_activity_tracker.sync.issues_and_prs.save_issue_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.save_pr_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.get_issue_json_path" + ) as mock_issue_path, patch( + "github_activity_tracker.sync.issues_and_prs.get_pr_json_path" + ) as mock_pr_path: + mock_issue_path.return_value = MagicMock() mock_pr_path.return_value = MagicMock() - + result = sync_issues_and_prs(mock_repo) - + assert result == {"issues": [1], "pull_requests": [2]} mock_proc_issue.assert_called_once() mock_proc_pr.assert_called_once() @@ -67,26 +69,26 @@ def test_sync_issues_and_prs_uses_min_start_date( mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" - + # Last issue updated at 2024-01-05 mock_last_issue = MagicMock() mock_last_issue.issue_updated_at = datetime(2024, 1, 5, tzinfo=timezone.utc) mock_repo.issues.order_by.return_value.first.return_value = mock_last_issue - + # Last PR updated at 2024-01-03 (earlier) mock_last_pr = MagicMock() mock_last_pr.pr_updated_at = datetime(2024, 1, 3, tzinfo=timezone.utc) mock_repo.pull_requests.order_by.return_value.first.return_value = mock_last_pr - + mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) - + mock_client = MagicMock() mock_get_client.return_value = mock_client mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] - + sync_issues_and_prs(mock_repo) - + # Should use 2024-01-03 + 1s (earliest) call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args start_date = call_args[0][3] # Fourth positional arg @@ -106,17 +108,17 @@ def test_sync_issues_and_prs_processes_existing_jsons_first( mock_repo.repo_name = "repo" 
mock_repo.issues.order_by.return_value.first.return_value = None mock_repo.pull_requests.order_by.return_value.first.return_value = None - + # Existing JSONs found mock_existing_issues.return_value = (2, [10, 11]) mock_existing_prs.return_value = (1, [20]) - + mock_client = MagicMock() mock_get_client.return_value = mock_client mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] - + result = sync_issues_and_prs(mock_repo) - + # Should include existing numbers in result assert 10 in result["issues"] assert 11 in result["issues"] @@ -136,21 +138,21 @@ def test_sync_issues_and_prs_respects_override_start_date( mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" - + mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) - + mock_client = MagicMock() mock_get_client.return_value = mock_client mock_fetcher.fetch_issues_and_prs_from_github.return_value = [] - + override_start = datetime(2023, 1, 1, tzinfo=timezone.utc) sync_issues_and_prs(mock_repo, start_date=override_start) - + # Should NOT query DB for last issue/PR mock_repo.issues.order_by.assert_not_called() mock_repo.pull_requests.order_by.assert_not_called() - + # Should pass override_start to fetcher call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args assert call_args[0][3] == override_start @@ -169,24 +171,29 @@ def test_sync_issues_and_prs_saves_and_removes_json_files( mock_repo.repo_name = "repo" mock_repo.issues.order_by.return_value.first.return_value = None mock_repo.pull_requests.order_by.return_value.first.return_value = None - + mock_existing_issues.return_value = (0, []) mock_existing_prs.return_value = (0, []) - + mock_client = MagicMock() mock_get_client.return_value = mock_client mock_fetcher.fetch_issues_and_prs_from_github.return_value = [ {"issue_info": {"number": 1}, "comments": []}, ] - + mock_json_path = MagicMock() - - with 
patch("github_activity_tracker.sync.issues_and_prs._process_issue_data"), \ - patch("github_activity_tracker.sync.issues_and_prs.save_issue_raw_source"), \ - patch("github_activity_tracker.sync.issues_and_prs.get_issue_json_path", return_value=mock_json_path): - + + with patch( + "github_activity_tracker.sync.issues_and_prs._process_issue_data" + ), patch( + "github_activity_tracker.sync.issues_and_prs.save_issue_raw_source" + ), patch( + "github_activity_tracker.sync.issues_and_prs.get_issue_json_path", + return_value=mock_json_path, + ): + sync_issues_and_prs(mock_repo) - + # Should write, then unlink mock_json_path.parent.mkdir.assert_called_once() mock_json_path.write_text.assert_called_once() From 47bdf3ff97a3e1adab3662d60d4911eede5d2fec Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 25 Mar 2026 03:57:29 -0400 Subject: [PATCH 29/76] fix(github): reconcile issue/PR labels and PR assignees; document unified start bounds - #125 --- clang_github_tracker/sync_raw.py | 14 +++++-- .../sync/issues_and_prs.py | 40 ++++++++++++++----- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 188836d..2ce62f9 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -78,8 +78,15 @@ def sync_raw_only( Args: start_commit: Start date for commits (None = from beginning). - start_issue: Start date for issues (None = from beginning). - start_pr: Start date for PRs (None = from beginning). + start_issue: Issue watermark for the unified issues+PRs fetch (one ``/issues`` + list with both item kinds). ``None`` only means “no issue cursor” when + deriving the shared start: if ``start_pr`` is also ``None``, the unified + fetch runs from the beginning; if ``start_pr`` is set, that timestamp is + used as the single lower bound for the whole list (issues are filtered + by the same window). 
When both ``start_issue`` and ``start_pr`` are set, + the shared lower bound is the **later** of the two (``max``), so one + GitHub query covers both types from that time forward. + start_pr: PR watermark; same shared-bound semantics as ``start_issue``. end_date: End date for all (default: now). Returns: @@ -106,7 +113,8 @@ def sync_raw_only( latest_issue: datetime | None = None latest_pr: datetime | None = None - # Derive a single start date for the unified issue+PR fetch: earliest of the two. + # Single lower bound for the unified /issues fetch: later of the two when both + # watermarks exist; otherwise whichever side is initialized (or None if both). if start_issue and start_pr: start_item = max(start_issue, start_pr) else: diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py index 2f95c8f..2d03858 100644 --- a/github_activity_tracker/sync/issues_and_prs.py +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -107,10 +107,18 @@ def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: ) services.add_issue_assignee(issue_obj, assignee_account) - for label_data in issue_data.get("labels", []): - label_name = label_data.get("name", "") - if label_name: - services.add_issue_label(issue_obj, label_name) + incoming_label_names = { + (label_data.get("name") or "") + for label_data in issue_data.get("labels", []) + if (label_data.get("name") or "") + } + existing_label_names = { + il.label_name for il in issue_obj.labels.all() if il.label_name + } + for label_name in existing_label_names - incoming_label_names: + services.remove_issue_label(issue_obj, label_name) + for label_name in incoming_label_names - existing_label_names: + services.add_issue_label(issue_obj, label_name) logger.debug("Issue #%s: saved to DB", issue_data.get("number")) @@ -211,8 +219,12 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: pr_review_updated_at=parse_datetime(review_data.get("updated_at")), 
) - for assignee_data in pr_data.get("assignees", []): - assignee_info = parse_github_user(assignee_data) + assignee_infos = [parse_github_user(a) for a in pr_data.get("assignees", [])] + current_assignee_ids = {i["account_id"] for i in assignee_infos if i["account_id"]} + for assignee_account in pr_obj.assignees.all(): + if assignee_account.github_account_id not in current_assignee_ids: + services.remove_pr_assignee(pr_obj, assignee_account) + for assignee_info in assignee_infos: if assignee_info["account_id"]: assignee_account, _ = get_or_create_github_account( github_account_id=assignee_info["account_id"], @@ -222,10 +234,18 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: ) services.add_pr_assignee(pr_obj, assignee_account) - for label_data in pr_data.get("labels", []): - label_name = label_data.get("name", "") - if label_name: - services.add_pull_request_label(pr_obj, label_name) + incoming_pr_label_names = { + (label_data.get("name") or "") + for label_data in pr_data.get("labels", []) + if (label_data.get("name") or "") + } + existing_pr_label_names = { + pl.label_name for pl in pr_obj.labels.all() if pl.label_name + } + for label_name in existing_pr_label_names - incoming_pr_label_names: + services.remove_pull_request_label(pr_obj, label_name) + for label_name in incoming_pr_label_names - existing_pr_label_names: + services.add_pull_request_label(pr_obj, label_name) logger.debug("PR #%s: saved to DB", pr_data.get("number")) From fcad9d9072fe803413f76a4eca72f753e344f42b Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 25 Mar 2026 04:58:03 -0400 Subject: [PATCH 30/76] fix(github_activity_tracker): paginate commits without rel=last; harden issues/PRs start_date - #125 --- github_activity_tracker/fetcher.py | 88 +++++++++++++------ .../sync/issues_and_prs.py | 35 +++++--- .../tests/test_fetcher_commits_backward.py | 31 +++++++ .../tests/test_sync_issues_and_prs.py | 10 +-- 4 files changed, 120 insertions(+), 44 deletions(-) diff 
--git a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index deef12b..ac25af9 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -117,11 +117,12 @@ def fetch_commits_from_github( end_time: Optional[datetime] = None, etag_cache: Optional[Any] = None, ) -> Iterator[dict]: - """Fetch commits from GitHub API oldest-to-newest using Link header backward traversal. + """Fetch commits from GitHub API oldest-to-newest using Link header pagination. - Fetches page 1 to discover the last page via the Link header, then walks backward - (last → prev → … → page 1), yielding commits in chronological order (oldest first) - within each page by reversing the newest-first GitHub default. + When GitHub includes rel="last", walks backward (last → prev → … → page 1) so + commits are yielded oldest-first. When rel="last" is omitted but rel="next" + is present (e.g. some since/until responses), follows "next" to fetch all + pages, then yields oldest-first. True single-page responses have neither link. The page-1 list response is cached in memory so when backward traversal returns to page 1 via the "prev" link, no duplicate request is made. @@ -170,42 +171,71 @@ def fetch_commits_from_github( ) last_url = first_page_links.get("last") + next_url = first_page_links.get("next") + + if last_url and not _is_first_page_url(last_url): + # Multiple pages: walk backward from last page to page 1, yielding oldest-first. + current_url: Optional[str] = last_url + while current_url is not None: + if _is_first_page_url(current_url): + # Reuse the already-fetched page-1 data — no extra API request. 
+ page_data = first_page_data + page_links = first_page_links + logger.debug("Backward traversal reached page 1; using cached data") + else: + page_data, page_links = client.rest_request_url_with_all_links( + current_url + ) + logger.debug( + "Fetched %d commits (backward traversal) from %s", + len(page_data) if page_data else 0, + current_url, + ) + time.sleep(0.2) + + for commit in reversed(page_data or []): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + + current_url = page_links.get("prev") - if not last_url or _is_first_page_url(last_url): - # Single page: reverse to yield oldest-first, then cache ETag and return. - logger.debug("Single page of commits; processing in reverse order") - for commit in reversed(first_page_data): - yield from _yield_commit_with_stats( - client, owner, repo, commit, start_time, end_time - ) if etag_cache is not None and first_page_etag: etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) return - # Multiple pages: walk backward from last page to page 1, yielding oldest-first. - current_url: Optional[str] = last_url - while current_url is not None: - if _is_first_page_url(current_url): - # Reuse the already-fetched page-1 data — no extra API request. - page_data = first_page_data - page_links = first_page_links - logger.debug("Backward traversal reached page 1; using cached data") - else: - page_data, page_links = client.rest_request_url_with_all_links(current_url) + if next_url: + # rel="last" omitted but rel="next" is present: fetch remaining pages, oldest-first. 
+ pages: list[list[dict]] = [first_page_data] + current_links = first_page_links + while current_links.get("next"): + forward_url = current_links["next"] + page_data, current_links = client.rest_request_url_with_all_links( + forward_url + ) logger.debug( - "Fetched %d commits (backward traversal) from %s", + "Fetched %d commits (forward pagination) from %s", len(page_data) if page_data else 0, - current_url, + forward_url, ) time.sleep(0.2) + pages.append(page_data or []) - for commit in reversed(page_data or []): - yield from _yield_commit_with_stats( - client, owner, repo, commit, start_time, end_time - ) - - current_url = page_links.get("prev") + for page_data in reversed(pages): + for commit in reversed(page_data): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) + if etag_cache is not None and first_page_etag: + etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) + return + # No pagination: neither next nor a multi-page last link. + logger.debug("Single page of commits; processing in reverse order") + for commit in reversed(first_page_data): + yield from _yield_commit_with_stats( + client, owner, repo, commit, start_time, end_time + ) if etag_cache is not None and first_page_etag: etag_cache.set("commits", 1, since_iso, until_iso, first_page_etag) diff --git a/github_activity_tracker/sync/issues_and_prs.py b/github_activity_tracker/sync/issues_and_prs.py index 2d03858..d893ba0 100644 --- a/github_activity_tracker/sync/issues_and_prs.py +++ b/github_activity_tracker/sync/issues_and_prs.py @@ -123,7 +123,9 @@ def _process_issue_data(repo: GitHubRepository, issue_data: dict) -> None: logger.debug("Issue #%s: saved to DB", issue_data.get("number")) -def _process_existing_issue_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: +def _process_existing_issue_jsons( + repo: GitHubRepository, +) -> tuple[int, list[int]]: """Load each issues/*.json in workspace for this repo, save to DB, remove file. 
Returns: @@ -250,7 +252,9 @@ def _process_pr_data(repo: GitHubRepository, pr_data: dict) -> None: logger.debug("PR #%s: saved to DB", pr_data.get("number")) -def _process_existing_pr_jsons(repo: GitHubRepository) -> tuple[int, list[int]]: +def _process_existing_pr_jsons( + repo: GitHubRepository, +) -> tuple[int, list[int]]: """Load each prs/*.json in workspace for this repo, save to DB, remove file. Returns: @@ -283,7 +287,7 @@ def sync_issues_and_prs( """Sync issues and PRs for a repo using a single GitHub /issues list call. 1. Process any existing issue/PR JSON files left from a previous interrupted run. - 2. Determine the start date as the earliest of the last-seen issue and PR update times. + 2. Determine the start date as the later (max) of the last-seen issue and PR update times. 3. Fetch items via fetch_issues_and_prs_from_github; each item is routed by key: - "issue_info" → persisted as an issue - "pr_info" → persisted as a pull request @@ -297,7 +301,9 @@ def sync_issues_and_prs( {"issues": [], "pull_requests": []} """ logger.info( - "sync_issues_and_prs: starting for repo id=%s (%s)", repo.pk, repo.repo_name + "sync_issues_and_prs: starting for repo id=%s (%s)", + repo.pk, + repo.repo_name, ) owner = repo.owner_account.username @@ -318,22 +324,24 @@ def sync_issues_and_prs( n_prs, ) - # Phase 2: determine start date from the earliest of last issue / last PR update. + # Phase 2: determine start date as max(last issue, last PR) +1s — shared /issues timeline. 
if start_date is None: last_issue = repo.issues.order_by("-issue_updated_at").first() last_pr = repo.pull_requests.order_by("-pr_updated_at").first() issue_date = ( (last_issue.issue_updated_at + timedelta(seconds=1)) - if last_issue + if last_issue and last_issue.issue_updated_at is not None else None ) pr_date = ( - (last_pr.pr_updated_at + timedelta(seconds=1)) if last_pr else None + (last_pr.pr_updated_at + timedelta(seconds=1)) + if last_pr and last_pr.pr_updated_at is not None + else None ) if issue_date and pr_date: - start_date = min(issue_date, pr_date) + start_date = max(issue_date, pr_date) else: start_date = issue_date or pr_date @@ -344,7 +352,12 @@ def sync_issues_and_prs( count_prs = 0 for item in fetcher.fetch_issues_and_prs_from_github( - client, owner, repo_name, start_date, end_date, etag_cache=etag_cache + client, + owner, + repo_name, + start_date, + end_date, + etag_cache=etag_cache, ): if "pr_info" in item: pr_number = (item["pr_info"] or {}).get("number") @@ -392,7 +405,9 @@ def sync_issues_and_prs( raise except Exception as e: logger.exception( - "sync_issues_and_prs: unexpected error for repo id=%s: %s", repo.pk, e + "sync_issues_and_prs: unexpected error for repo id=%s: %s", + repo.pk, + e, ) raise diff --git a/github_activity_tracker/tests/test_fetcher_commits_backward.py b/github_activity_tracker/tests/test_fetcher_commits_backward.py index 60cc9f8..35addf4 100644 --- a/github_activity_tracker/tests/test_fetcher_commits_backward.py +++ b/github_activity_tracker/tests/test_fetcher_commits_backward.py @@ -29,6 +29,37 @@ def test_fetch_commits_single_page_yields_oldest_first(): assert [c["sha"] for c in commits] == ["c1", "c2", "c3"] +def test_fetch_commits_next_without_last_forward_pagination(): + """When rel=last is omitted but rel=next is present, follow next for all pages.""" + client = MagicMock() + + client.rest_request_with_all_links.return_value = ( + [{"sha": "c3", "commit": {"author": {"date": "2024-01-03T00:00:00Z"}}}], + 
{"next": "https://api.github.com/repos/o/r/commits?page=2"}, + ) + + client.rest_request_url_with_all_links.side_effect = [ + ( + [{"sha": "c2", "commit": {"author": {"date": "2024-01-02T00:00:00Z"}}}], + {"next": "https://api.github.com/repos/o/r/commits?page=3"}, + ), + ( + [{"sha": "c1", "commit": {"author": {"date": "2024-01-01T00:00:00Z"}}}], + {}, + ), + ] + + client.rest_request.side_effect = lambda url: { + "sha": url.split("/")[-1], + "stats": {}, + } + + commits = list(fetch_commits_from_github(client, "owner", "repo")) + + assert [c["sha"] for c in commits] == ["c1", "c2", "c3"] + assert client.rest_request_url_with_all_links.call_count == 2 + + def test_fetch_commits_multiple_pages_backward_traversal(): """fetch_commits_from_github walks backward from last page to first.""" client = MagicMock() diff --git a/github_activity_tracker/tests/test_sync_issues_and_prs.py b/github_activity_tracker/tests/test_sync_issues_and_prs.py index ded6124..aca2c33 100644 --- a/github_activity_tracker/tests/test_sync_issues_and_prs.py +++ b/github_activity_tracker/tests/test_sync_issues_and_prs.py @@ -62,10 +62,10 @@ def test_sync_issues_and_prs_processes_both_types( @patch("github_activity_tracker.sync.issues_and_prs.fetcher") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_issue_jsons") @patch("github_activity_tracker.sync.issues_and_prs._process_existing_pr_jsons") -def test_sync_issues_and_prs_uses_min_start_date( +def test_sync_issues_and_prs_uses_max_start_date( mock_existing_prs, mock_existing_issues, mock_fetcher, mock_get_client ): - """sync_issues_and_prs uses earliest of last_issue and last_pr updated_at as start_date.""" + """sync_issues_and_prs uses the later of last_issue and last_pr (+1s) as start_date.""" mock_repo = MagicMock() mock_repo.owner_account.username = "owner" mock_repo.repo_name = "repo" @@ -75,7 +75,7 @@ def test_sync_issues_and_prs_uses_min_start_date( mock_last_issue.issue_updated_at = datetime(2024, 1, 5, 
tzinfo=timezone.utc) mock_repo.issues.order_by.return_value.first.return_value = mock_last_issue - # Last PR updated at 2024-01-03 (earlier) + # Last PR updated at 2024-01-03 (older than last issue) mock_last_pr = MagicMock() mock_last_pr.pr_updated_at = datetime(2024, 1, 3, tzinfo=timezone.utc) mock_repo.pull_requests.order_by.return_value.first.return_value = mock_last_pr @@ -89,10 +89,10 @@ def test_sync_issues_and_prs_uses_min_start_date( sync_issues_and_prs(mock_repo) - # Should use 2024-01-03 + 1s (earliest) + # Should use max(issue_date, pr_date) → 2024-01-05 + 1s call_args = mock_fetcher.fetch_issues_and_prs_from_github.call_args start_date = call_args[0][3] # Fourth positional arg - assert start_date == datetime(2024, 1, 3, 0, 0, 1, tzinfo=timezone.utc) + assert start_date == datetime(2024, 1, 5, 0, 0, 1, tzinfo=timezone.utc) @patch("github_activity_tracker.sync.issues_and_prs.get_github_client") From 1c9c741c9fc8857d92314c3a3a56f4b9565cf4d6 Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Mar 2026 22:16:48 +0800 Subject: [PATCH 31/76] #126-fixed this app and cppa-pinecone app --- boost_library_docs_tracker/fetcher.py | 8 +- .../run_boost_library_docs_tracker.py | 68 ++++--- boost_library_docs_tracker/preprocessor.py | 79 +++++--- .../tests/test_preprocessor.py | 84 +++++++++ cppa_pinecone_sync/sync.py | 171 +++++------------- cppa_pinecone_sync/tests/test_sync.py | 41 +++++ 6 files changed, 254 insertions(+), 197 deletions(-) diff --git a/boost_library_docs_tracker/fetcher.py b/boost_library_docs_tracker/fetcher.py index 7902cb1..a5ba6de 100644 --- a/boost_library_docs_tracker/fetcher.py +++ b/boost_library_docs_tracker/fetcher.py @@ -76,9 +76,9 @@ def download_source_zip(version: str, dest_dir: Path) -> Path: zip_name = f"boost_{normalized.replace('.', '_')}.zip" zip_path = dest_dir / zip_name - # if zip_path.exists(): - # logger.info("Source zip already present, skipping download: %s", zip_path) - # return zip_path + if zip_path.exists(): + 
logger.info("Source zip already present, skipping download: %s", zip_path) + return zip_path dest_dir.mkdir(parents=True, exist_ok=True) session = _get_session() @@ -327,7 +327,7 @@ def crawl_library_pages( abs_url = abs_url.split("#")[0] if ( abs_url not in visited - and abs_url.startswith(start_url) + and lib_key.split("/")[-1] not in abs_url and abs_url not in queue ): queue.append(abs_url) diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index ac6ef24..98fb6e5 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -38,7 +38,6 @@ import logging from pathlib import Path -from django.apps import apps from django.core.management.base import BaseCommand, CommandError from boost_library_docs_tracker import fetcher, services, workspace @@ -47,8 +46,8 @@ logger = logging.getLogger(__name__) -APP_TYPE = "boost_library_docs" -PINECONE_NAMESPACE = "boost_library_docs" +APP_TYPE = "boost-library-documentation" +PINECONE_NAMESPACE = "boost-library-documentation" DEFAULT_MAX_PAGES = 10 @@ -146,6 +145,7 @@ def _run( ) mode = "local-zip" if use_local else "HTTP crawl" self.stdout.write(f"Scrape mode: {mode}") + versions = [f"boost-1.{i}.0" for i in range(63, 91)] for version in versions: self._process_version( @@ -157,10 +157,10 @@ def _run( cleanup_extract=cleanup_extract, ) - if dry_run or skip_pinecone: - reason = "dry run" if dry_run else "--skip-pinecone set" - self.stdout.write(f"Skipping Pinecone sync ({reason}).") - return + if dry_run or skip_pinecone: + reason = "dry run" if dry_run else "--skip-pinecone set" + self.stdout.write(f"Skipping Pinecone sync ({reason}).") + continue self._sync_pinecone() @@ -213,9 +213,9 @@ def _prepare_local_source(self, *, version: str) -> Path: zip_dir = workspace.get_zip_dir() 
extract_dir = workspace.get_extract_dir() - if zip_dir.exists(): - self.stdout.write(f"[{version}] Source zip already exists at {zip_dir}") - return extract_dir + # if zip_dir.exists(): + # self.stdout.write(f"[{version}] Source zip already exists at {zip_dir}") + # return extract_dir try: zip_path = fetcher.download_source_zip(version, zip_dir) @@ -355,16 +355,7 @@ def _save_pages_to_workspace_and_db( # ------------------------------------------------------------------ def _sync_pinecone(self): - if not apps.is_installed("cppa_pinecone_sync"): - self.stdout.write( - self.style.WARNING( - "Skipping Pinecone sync: 'cppa_pinecone_sync' is not in INSTALLED_APPS." - ) - ) - self.stdout.write( - "Add 'cppa_pinecone_sync' to INSTALLED_APPS or run with --skip-pinecone." - ) - return + """Sync to Pinecone""" try: from cppa_pinecone_sync.sync import sync_to_pinecone @@ -477,27 +468,32 @@ def _get_library_list(self, version: str) -> list[tuple[Path, str]]: result.append((start_path, lib_key)) return result - def _resolve_library_version_id(self, lib_name: str, version: str) -> int | None: + def _resolve_library_version_id(self, lib_key: str, version: str) -> int | None: """Resolve BoostLibraryVersion id from DB. 
Returns None if not found.""" - try: - lv = BoostLibraryVersion.objects.select_related("library", "version").get( - library__name=lib_name, - version__version=version, - ) - return lv.pk - except BoostLibraryVersion.DoesNotExist: + lib_key = (lib_key or "").strip() + if not lib_key: return None - except BoostLibraryVersion.MultipleObjectsReturned: + + base_qs = BoostLibraryVersion.objects.select_related( + "library", "version" + ).filter(version__version=version) + # 1) Preferred: key + version + qs = base_qs.filter(key=lib_key) + lv = qs.first() + if lv: + return lv.pk + + # 2) Optional compatibility fallback: name + version + qs = base_qs.filter(library__name=lib_key) + lv = qs.first() + if lv: logger.warning( - "Multiple BoostLibraryVersion rows for lib=%s ver=%s; using first.", - lib_name, + "Resolved by library name fallback (missing/mismatched key): lib_key=%s, version=%s", + lib_key, version, ) - lv = BoostLibraryVersion.objects.filter( - library__name=lib_name, - version__version=version, - ).first() - return lv.pk if lv is not None else None + return lv.pk + return None def _resolve_boost_version_id(self, version: str) -> int | None: """Resolve BoostVersion PK from the version string. Returns None if not found.""" diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index 2b36c19..32e8019 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -4,8 +4,11 @@ Called by cppa_pinecone_sync.sync.sync_to_pinecone as the preprocess_fn argument. Signature matches the PreprocessFn contract: (failed_ids: list[str], final_sync_at: datetime | None) - -> tuple[list[dict], bool] - OR tuple[list[dict], bool, list[dict]] (with metadata updates) + -> tuple[list[dict], bool, list[dict]] + +The third list is metas_to_update: already-upserted rows whose scraped_at is +after final_sync_at (metadata refresh in Pinecone). Empty when final_sync_at +is None or nothing is stale. 
The failed_ids values come from failed_documents[*]["ids"] in the upsert result, which are BoostDocContent PKs encoded as strings. @@ -26,30 +29,32 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, -) -> tuple[list[dict[str, Any]], bool]: +) -> tuple[list[dict[str, Any]], bool, list[dict[str, Any]]]: """ - Build documents for Pinecone upsert from BoostDocContent records. + Build documents for Pinecone upsert and optional metadata updates. + + Upsert batch: BoostDocContent where is_upserted=False or PK is in failed_ids + (retry). Loads page text from the workspace for each row. - Selects BoostDocContent records where is_upserted=False (not yet synced) - or whose PK is in failed_ids (retry after a previous failure). - final_sync_at is accepted for interface compatibility but is not used — - is_upserted is the authoritative sync state. + Metadata batch (metas_to_update): when final_sync_at is set, rows with + is_upserted=True and scraped_at > final_sync_at (re-scraped after last sync), + excluding failed_ids. Same document shape as the upsert batch so + ingestion.update_documents can refresh metadata; doc_id remains content_hash. - For each selected record: - - Resolves first_version / last_version from the FK fields on BoostDocContent. - - Loads page content from the workspace file. - - Returns source ids in metadata["ids"] so the caller can mark - BoostDocContent.is_upserted=True only after a successful Pinecone upsert. + When final_sync_at is None, metas_to_update is always [] (no incremental + stale-metadata pass). - Returns (documents, is_chunked=False). - doc_id in metadata is the content_hash of the BoostDocContent row. + Returns (documents, is_chunked=False, metas_to_update). 
""" - records = _select_records(failed_ids, final_sync_at) - if not records: - return [], False + upsert_records = _select_upsert_records(failed_ids) + meta_records = _select_metadata_update_records(failed_ids, final_sync_at) + + if not upsert_records and not meta_records: + return [], False, [] - documents, _ids_to_mark = _build_documents(records) - return documents, False + documents, _ = _build_documents(upsert_records) + metas_to_update, _ = _build_documents(meta_records) + return documents, False, metas_to_update # --------------------------------------------------------------------------- @@ -57,16 +62,8 @@ def preprocess_for_pinecone( # --------------------------------------------------------------------------- -def _select_records( - failed_ids: list[str], - final_sync_at: datetime | None, -) -> list[BoostDocContent]: - """Return BoostDocContent records to process. - - Selects rows that are not yet upserted (is_upserted=False) or are in - failed_ids for retry. The final_sync_at parameter is accepted for interface - compatibility but is not used — is_upserted is the authoritative sync state. 
- """ +def _select_upsert_records(failed_ids: list[str]) -> list[BoostDocContent]: + """Rows to vector-upsert: not yet upserted or explicitly failed (retry).""" from django.db.models import Q int_failed_ids = _parse_int_ids(failed_ids) @@ -82,6 +79,28 @@ def _select_records( return list(qs) +def _select_metadata_update_records( + failed_ids: list[str], + final_sync_at: datetime | None, +) -> list[BoostDocContent]: + """Rows needing Pinecone metadata refresh only (already upserted, scraped since sync).""" + if final_sync_at is None: + return [] + + int_failed_ids = _parse_int_ids(failed_ids) + qs = ( + BoostDocContent.objects.filter( + is_upserted=True, + scraped_at__gt=final_sync_at, + ) + .select_related("first_version", "last_version") + .order_by("id") + ) + if int_failed_ids: + qs = qs.exclude(pk__in=int_failed_ids) + return list(qs) + + def _parse_int_ids(failed_ids: list[str]) -> list[int]: """Convert string IDs to ints, skipping malformed values. diff --git a/boost_library_docs_tracker/tests/test_preprocessor.py b/boost_library_docs_tracker/tests/test_preprocessor.py index 5fb482a..5137c4e 100644 --- a/boost_library_docs_tracker/tests/test_preprocessor.py +++ b/boost_library_docs_tracker/tests/test_preprocessor.py @@ -1,13 +1,17 @@ """Tests for boost_library_docs_tracker.preprocessor.""" from datetime import timedelta +from unittest.mock import patch import pytest from django.utils import timezone from boost_library_docs_tracker import preprocessor, services +from boost_library_docs_tracker.models import BoostDocContent from boost_library_tracker import services as boost_library_services +_PAGE = "x" * 200 # long enough for downstream chunk validation if needed + @pytest.mark.django_db def test_get_library_name_uses_latest_relation( @@ -53,3 +57,83 @@ def test_get_library_name_uses_latest_relation( def test_get_library_name_returns_empty_without_relation(boost_doc_content): """_get_library_name returns an empty string when no relation exists.""" assert 
preprocessor._get_library_name(boost_doc_content) == "" + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_metas_when_upserted_and_scraped_after_final_sync( + _mock_load, + boost_doc_content, +): + """Stale upserted rows (scraped_at > final_sync_at) appear in metas_to_update only.""" + now = timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now, + ) + final_sync = now - timedelta(hours=1) + docs, chunked, metas = preprocessor.preprocess_for_pinecone([], final_sync) + assert docs == [] + assert chunked is False + assert len(metas) == 1 + assert metas[0]["metadata"]["doc_id"] == boost_doc_content.content_hash + assert metas[0]["metadata"]["ids"] == str(boost_doc_content.pk) + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_no_metas_when_final_sync_at_none(_mock_load, boost_doc_content): + """With final_sync_at None, metas_to_update is empty (no stale-metadata scan).""" + docs, chunked, metas = preprocessor.preprocess_for_pinecone([], None) + assert chunked is False + assert metas == [] + assert len(docs) == 1 + assert docs[0]["metadata"]["ids"] == str(boost_doc_content.pk) + + +@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_metas_empty_when_scraped_before_final_sync( + _mock_load, + boost_doc_content, +): + """Upserted row scraped before final_sync_at is not in metas_to_update.""" + now = timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now - timedelta(hours=2), + ) + final_sync = now - timedelta(hours=1) + docs, _, metas = preprocessor.preprocess_for_pinecone([], final_sync) + assert docs == [] + assert metas == [] + + 
+@pytest.mark.django_db +@patch( + "boost_library_docs_tracker.preprocessor.workspace.load_page_by_url", + return_value=_PAGE, +) +def test_preprocess_meta_excludes_failed_ids(_mock_load, boost_doc_content): + """Rows in failed_ids are not selected for metadata-only update.""" + now = timezone.now() + BoostDocContent.objects.filter(pk=boost_doc_content.pk).update( + is_upserted=True, + scraped_at=now, + ) + final_sync = now - timedelta(hours=1) + docs, _, metas = preprocessor.preprocess_for_pinecone( + [str(boost_doc_content.pk)], final_sync + ) + assert metas == [] + assert len(docs) == 1 + assert docs[0]["metadata"]["ids"] == str(boost_doc_content.pk) diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 33e5f2e..6b838ce 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -1,54 +1,33 @@ """ - Main entry point for Pinecone sync. - - Other apps call ``sync_to_pinecone()`` to push their data into Pinecone. - This module orchestrates the full flow: - - 1. Collect failed IDs and last sync timestamp from the database. - 2. Call the caller-provided preprocessing function to get documents. - 3. Upsert documents to Pinecone via PineconeIngestion. - 4. Update the fail list and sync status in the database. - - See docs/pinecone_sync.md for the full specification. - """ from __future__ import annotations - import logging - from datetime import datetime - from typing import Any, Callable, Optional - from django.db import transaction - from . import services - from .ingestion import PineconeIngestion, PineconeInstance - logger = logging.getLogger(__name__) - # Module-level singletons keyed by instance; created on first use so that # Django settings are available and Pinecone libraries are imported only when # needed. 
- _ingestion_pool: dict[str, PineconeIngestion] = {} @@ -56,24 +35,17 @@ def _get_ingestion( instance: PineconeInstance = PineconeInstance.PUBLIC, ) -> PineconeIngestion: """Return (and lazily create) a PineconeIngestion for *instance*.""" - key = instance.value - if key not in _ingestion_pool: - _ingestion_pool[key] = PineconeIngestion(instance=instance) - return _ingestion_pool[key] # Type alias for the preprocessing function that callers must supply. - # Signature: - # - legacy: (failed_ids, final_sync_at) -> (raw_documents, is_chunked) - -# - metadata update: (failed_ids, final_sync_at) -> (raw_documents, is_chunked, metas_to_update) - +# - metadata update: (failed_ids, final_sync_at) -> +# (raw_documents, is_chunked, metas_to_update) PreprocessFn = Callable[ [list[str], Optional[datetime]], tuple[list[dict[str, Any]], bool] @@ -83,7 +55,6 @@ def _get_ingestion( def _empty_sync_result() -> dict[str, Any]: """Return the standard empty sync result dict.""" - return { "upserted": 0, "updated": 0, @@ -99,30 +70,22 @@ def _empty_sync_result() -> dict[str, Any]: def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: """Convert preprocess output to langchain Documents; skip items missing doc_id/url.""" - from langchain_core.documents import Document documents: list[Any] = [] - for item in raw_documents: - content = item.get("content", "") - metadata = dict(item.get("metadata") or {}) - ids_str = metadata.get("ids") or item.get("ids", "") or "" if "doc_id" not in metadata and "url" not in metadata: - logger.warning( "Skipping document with ids=%s: metadata must contain 'doc_id' or 'url'", ids_str, ) - continue metadata["table_ids"] = ids_str - documents.append(Document(page_content=content, metadata=metadata)) return documents @@ -130,53 +93,30 @@ def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: def _extract_new_failed_ids(result: dict[str, Any]) -> list[str]: """Collect source IDs from failed_documents in the 
upsert result.""" - new_failed_ids: list[str] = [] - for failed_doc in result.get("failed_documents", []): - ids_str = failed_doc.get("ids", "") - if ids_str: - new_failed_ids.extend( fid.strip() for fid in ids_str.split(",") if fid.strip() ) - return new_failed_ids def _extract_source_ids_from_documents(documents: list[Any]) -> list[str]: - """ - - Collect deduplicated source IDs from Document.metadata.table_ids in order. - - """ - + """Collect deduplicated source IDs from Document.metadata.table_ids in order.""" seen: set[str] = set() - source_ids: list[str] = [] - for doc in documents: - table_ids = str(doc.metadata.get("table_ids", "")).strip() - if not table_ids: - continue - for token in table_ids.split(","): - source_id = token.strip() - if not source_id or source_id in seen: - continue - seen.add(source_id) - source_ids.append(source_id) - return source_ids @@ -188,42 +128,23 @@ def sync_to_pinecone( ) -> dict[str, Any]: """Run a full Pinecone sync cycle for *app_type*. - - This is the **public API** that other apps call. - - Args: - app_type: Identifies the data source (e.g. "slack", "mailing"). Stored as - - CharField in - - PineconeFailList and PineconeSyncStatus. - + CharField in PineconeFailList and PineconeSyncStatus. namespace: Pinecone namespace to upsert into. - - preprocess_fn: A callable returning ``(list[dict], is_chunked)``. Each dict - - must have ``content`` and ``metadata``; ``metadata`` must contain - - ``doc_id`` or ``url``. See docs/Pinecone_preprocess_guideline.md. - + preprocess_fn: A callable returning ``(list[dict], is_chunked)`` or + ``(list[dict], is_chunked, metas_to_update)``. Each dict must have + ``content`` and ``metadata``; ``metadata`` must contain ``doc_id`` + or ``url``. See docs/Pinecone_preprocess_guideline.md. instance: Which Pinecone API key to use (public or private). - Default is public. - - Returns: - dict with keys: upserted, updated, total, failed_count, failed_ids, - errors, update_errors. 
- """ - logger.info( "sync_to_pinecone: starting app_type=%s namespace=%s instance=%s", app_type, @@ -232,9 +153,7 @@ def sync_to_pinecone( ) failed_ids = services.get_failed_ids(app_type) - final_sync_at = services.get_final_sync_at(app_type) - logger.debug( "app_type=%s: %d previously failed IDs, final_sync_at=%s", app_type, @@ -245,76 +164,75 @@ def sync_to_pinecone( preprocess_result = preprocess_fn(failed_ids, final_sync_at) if len(preprocess_result) == 2: - raw_documents, is_chunked = preprocess_result - metas_to_update: list[dict[str, Any]] = [] - elif len(preprocess_result) == 3: - raw_documents, is_chunked, metas_to_update = preprocess_result - else: - raise ValueError( "preprocess_fn must return either " "(raw_documents, is_chunked) or " "(raw_documents, is_chunked, metas_to_update)" ) - if not raw_documents: - + if not raw_documents and not metas_to_update: logger.info( - "sync_to_pinecone: preprocess returned 0 documents for app_type=%s", + "sync_to_pinecone: preprocess returned 0 upsert docs and 0 metadata " + "updates for app_type=%s", app_type, ) - - services.update_sync_status(app_type) - return _empty_sync_result() - documents = _build_documents_from_raw(raw_documents) - - if not documents: - - services.update_sync_status(app_type) + upsert_documents = _build_documents_from_raw(raw_documents) if raw_documents else [] + meta_documents = ( + _build_documents_from_raw(metas_to_update) if metas_to_update else [] + ) + if not upsert_documents and not meta_documents: + logger.info( + "sync_to_pinecone: no valid documents after filtering for app_type=%s", + app_type, + ) return _empty_sync_result() - attempted_source_ids = _extract_source_ids_from_documents(documents) - ingestion = _get_ingestion(instance) + attempted_source_ids = _extract_source_ids_from_documents(upsert_documents) - result = ingestion.upsert_documents( - documents=documents, namespace=namespace, is_chunked=is_chunked - ) + if upsert_documents: + result = ingestion.upsert_documents( + 
documents=upsert_documents, + namespace=namespace, + is_chunked=is_chunked, + ) + else: + result = { + "upserted": 0, + "total": 0, + "errors": [], + "failed_documents": [], + } update_result: dict[str, Any] = {"updated": 0, "errors": []} - if metas_to_update: - - documents = _build_documents_from_raw(metas_to_update) - - if not documents: - - services.update_sync_status(app_type) - - return _empty_sync_result() - + if meta_documents: update_result = ingestion.update_documents( - documents=documents, namespace=namespace, is_chunked=is_chunked + documents=meta_documents, + namespace=namespace, + is_chunked=is_chunked, + ) + elif metas_to_update: + logger.warning( + "sync_to_pinecone: metas_to_update produced no valid documents " + "for app_type=%s (skipped metadata update)", + app_type, ) new_failed_ids = _extract_new_failed_ids(result) with transaction.atomic(): - services.clear_failed_ids(app_type) - if new_failed_ids: - services.record_failed_ids(app_type, new_failed_ids) - logger.warning( "app_type=%s: %d source IDs recorded as failed", app_type, @@ -324,7 +242,6 @@ def sync_to_pinecone( services.update_sync_status(app_type) failed_source_ids_set = set(new_failed_ids) - successful_source_ids = [ source_id for source_id in attempted_source_ids diff --git a/cppa_pinecone_sync/tests/test_sync.py b/cppa_pinecone_sync/tests/test_sync.py index bd22395..23118fc 100644 --- a/cppa_pinecone_sync/tests/test_sync.py +++ b/cppa_pinecone_sync/tests/test_sync.py @@ -223,6 +223,47 @@ def preprocess(_failed_ids, _final_sync_at): assert services.get_final_sync_at(app_type) is not None +@pytest.mark.django_db +@patch("cppa_pinecone_sync.sync._get_ingestion") +def test_sync_to_pinecone_metadata_only_calls_update(mock_get_ingestion, app_type): + """Empty upsert batch but non-empty metas_to_update still runs update_documents.""" + mock_ingestion = MagicMock() + mock_ingestion.update_documents.return_value = { + "updated": 3, + "total": 3, + "errors": [], + "failed_documents": [], + } 
+ mock_get_ingestion.return_value = mock_ingestion + + def preprocess(_failed_ids, _final_sync_at): + return ( + [], + False, + [ + { + "ids": "10", + "content": "metadata-only body " * 20, + "metadata": {"doc_id": "h1"}, + }, + ], + ) + + result = sync_to_pinecone(app_type, "meta_ns", preprocess) + + mock_ingestion.upsert_documents.assert_not_called() + mock_ingestion.update_documents.assert_called_once() + call_kw = mock_ingestion.update_documents.call_args[1] + assert call_kw["namespace"] == "meta_ns" + assert len(call_kw["documents"]) == 1 + assert result["upserted"] == 0 + assert result["total"] == 0 + assert result["failed_count"] == 0 + assert result["updated"] == 3 + assert result["failed_ids"] == [] + assert services.get_final_sync_at(app_type) is not None + + @pytest.mark.django_db @patch("cppa_pinecone_sync.sync._get_ingestion") def test_sync_to_pinecone_returns_metadata_update_result(mock_get_ingestion, app_type): From d7788269f54625db846566117654a327004dfed0 Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Mar 2026 22:27:31 +0800 Subject: [PATCH 32/76] #126-fixed ci test errors --- boost_library_docs_tracker/fetcher.py | 15 +++++++++------ cppa_pinecone_sync/tests/test_sync.py | 7 ++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/boost_library_docs_tracker/fetcher.py b/boost_library_docs_tracker/fetcher.py index a5ba6de..5fe6a86 100644 --- a/boost_library_docs_tracker/fetcher.py +++ b/boost_library_docs_tracker/fetcher.py @@ -320,17 +320,20 @@ def crawl_library_pages( # Enqueue in-scope links soup = BeautifulSoup(resp.text, "lxml") + lib_segment = lib_key.split("/")[-1] for a in soup.find_all("a", href=True): href: str = a["href"] abs_url = urljoin(final_url, href) # Strip fragment abs_url = abs_url.split("#")[0] - if ( - abs_url not in visited - and lib_key.split("/")[-1] not in abs_url - and abs_url not in queue - ): - queue.append(abs_url) + if not abs_url.startswith(base_url): + continue + # Stay within this library's doc 
subtree (path contains lib segment) + if lib_segment not in abs_url: + continue + if abs_url in visited or abs_url in queue: + continue + queue.append(abs_url) logger.debug( "Crawled %d pages for root %s (max_pages=%s)", diff --git a/cppa_pinecone_sync/tests/test_sync.py b/cppa_pinecone_sync/tests/test_sync.py index 23118fc..8f1dbed 100644 --- a/cppa_pinecone_sync/tests/test_sync.py +++ b/cppa_pinecone_sync/tests/test_sync.py @@ -160,8 +160,9 @@ def test_extract_new_failed_ids_skips_empty(): @pytest.mark.django_db -def test_sync_to_pinecone_empty_preprocess_returns_early(app_type): - """sync_to_pinecone returns empty result and updates status when preprocess returns no docs.""" +def test_sync_to_pinecone_empty_preprocess_returns_early(): + """No upsert/metadata work: empty result and PineconeSyncStatus is not touched.""" + app_type = "test_empty_preprocess_sync" def preprocess(_failed_ids, _final_sync_at): return [], False @@ -170,7 +171,7 @@ def preprocess(_failed_ids, _final_sync_at): assert result["upserted"] == 0 assert result["total"] == 0 assert result["failed_ids"] == [] - assert services.get_final_sync_at(app_type) is not None + assert services.get_final_sync_at(app_type) is None @pytest.mark.django_db From 5532eb4be34eeefb4717191d651604250e2e45fa Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Mar 2026 23:13:03 +0800 Subject: [PATCH 33/76] #126-added the removing logic for downloaded zip file --- .../run_boost_library_docs_tracker.py | 44 +++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index 98fb6e5..91a0c24 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -11,10 +11,11 @@ 3. 
For each library, fetch docs and save to workspace: - Default (--use-local not set): HTTP BFS crawl per library. - --use-local: download source zip once per version, extract, walk local HTML. - Zip is saved in workspace/raw/boost_library_docs_tracker/ and is not deleted. + Zip is saved in workspace/raw/boost_library_docs_tracker/. Extract tree is saved in workspace/boost_library_docs_tracker/extracted/. Converted page content is saved in workspace/boost_library_docs_tracker/converted/. - Pass --cleanup-extract to delete the extract tree after all libraries are done. + Pass --cleanup-extract to delete the extract tree and the downloaded zip after + all libraries for that version are done. 4. Fill BoostDocContent and BoostLibraryDocumentation tables (no page_content in DB). - New content_hash → create new BoostDocContent row, set first_version and last_version. - Same content_hash but different URL → update url and scraped_at, update last_version. @@ -102,8 +103,8 @@ def add_arguments(self, parser): "--cleanup-extract", action="store_true", help=( - "Delete the extracted source tree after all libraries for a version are " - "processed (only with --use-local)." + "Delete the extracted source tree and the raw zip under workspace/raw/ " + "after all libraries for a version are processed (only with --use-local)." 
), ) @@ -145,7 +146,7 @@ def _run( ) mode = "local-zip" if use_local else "HTTP crawl" self.stdout.write(f"Scrape mode: {mode}") - versions = [f"boost-1.{i}.0" for i in range(63, 91)] + versions = [f"boost-1.{i}.0" for i in range(64, 91)] for version in versions: self._process_version( @@ -157,10 +158,10 @@ def _run( cleanup_extract=cleanup_extract, ) - if dry_run or skip_pinecone: - reason = "dry run" if dry_run else "--skip-pinecone set" - self.stdout.write(f"Skipping Pinecone sync ({reason}).") - continue + if dry_run or skip_pinecone: + reason = "dry run" if dry_run else "--skip-pinecone set" + self.stdout.write(f"Skipping Pinecone sync ({reason}).") + return self._sync_pinecone() @@ -180,9 +181,10 @@ def _process_version( self.stdout.write(f"[{version}] {len(library_list)} library/libraries.") source_root: Path | None = None + zip_path: Path | None = None if use_local: - source_root = self._prepare_local_source(version=version) + source_root, zip_path = self._prepare_local_source(version=version) # Resolve once per version; used to track first/last_version on BoostDocContent. boost_version_id = self._resolve_boost_version_id(version) @@ -202,13 +204,29 @@ def _process_version( if use_local and cleanup_extract and source_root is not None: fetcher.delete_extract_dir(source_root) + if zip_path is not None: + try: + zip_path.unlink(missing_ok=True) + self.stdout.write( + self.style.NOTICE( + f"[{version}] Removed source zip {zip_path.name}" + ) + ) + except OSError as exc: + logger.warning("Could not remove source zip %s: %s", zip_path, exc) + self.stdout.write( + self.style.WARNING( + f"[{version}] Could not remove source zip: {exc}" + ) + ) self.stdout.write(f"[{version}] Done — {total_pages} pages total.") - def _prepare_local_source(self, *, version: str) -> Path: + def _prepare_local_source(self, *, version: str) -> tuple[Path, Path]: """Download and extract the Boost source zip for a version. - Returns source_root — the top-level extracted directory. 
+ Returns (source_root, zip_path): top-level extracted directory and path to + the zip under workspace/raw/boost_library_docs_tracker/. """ zip_dir = workspace.get_zip_dir() extract_dir = workspace.get_extract_dir() @@ -232,7 +250,7 @@ def _prepare_local_source(self, *, version: str) -> Path: ) from exc self.stdout.write(f"[{version}] Source ready at {source_root}") - return source_root + return source_root, zip_path def _process_library( self, From 6ac0142540396f7fa3dc8d72304a6ceb0e23205d Mon Sep 17 00:00:00 2001 From: zho Date: Wed, 25 Mar 2026 23:57:31 +0800 Subject: [PATCH 34/76] #126-removed the seen in sync.py --- cppa_pinecone_sync/sync.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 6b838ce..5afc585 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -105,7 +105,6 @@ def _extract_new_failed_ids(result: dict[str, Any]) -> list[str]: def _extract_source_ids_from_documents(documents: list[Any]) -> list[str]: """Collect deduplicated source IDs from Document.metadata.table_ids in order.""" - seen: set[str] = set() source_ids: list[str] = [] for doc in documents: table_ids = str(doc.metadata.get("table_ids", "")).strip() @@ -113,9 +112,8 @@ def _extract_source_ids_from_documents(documents: list[Any]) -> list[str]: continue for token in table_ids.split(","): source_id = token.strip() - if not source_id or source_id in seen: + if not source_id or source_id in source_ids: continue - seen.add(source_id) source_ids.append(source_id) return source_ids From fe5ec767322c98900e3e7518429ef4251bfdf032 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 25 Mar 2026 13:26:02 -0400 Subject: [PATCH 35/76] Remove fetch issue and pr functions respectively in fetcher function - #125 --- github_activity_tracker/fetcher.py | 214 ------------------ github_activity_tracker/tests/test_fetcher.py | 102 --------- 2 files changed, 316 deletions(-) diff --git 
a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index ac25af9..71d3490 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -301,125 +301,6 @@ def fetch_comments_from_github( return results -def fetch_issues_from_github( - client: GitHubAPIClient, - owner: str, - repo: str, - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - etag_cache: Optional[Any] = None, -) -> Iterator[dict]: - """Fetch issues from GitHub API (paginated). Yields issue dicts with comments. - Uses GitHub's Link header (rel=\"next\") for pagination per API docs. - If etag_cache is provided, uses conditional GET for the first page when using endpoint+params. - """ - logger.debug(f"Fetching issues for {owner}/{repo} from {start_time} to {end_time}") - per_page = 100 - since_iso = start_time.isoformat() if start_time else "" - endpoint = f"/repos/{owner}/{repo}/issues" - next_url: Optional[str] = None - page_num = 1 - - while True: - # Fresh each page: rest_request_url does not return an ETag; do not reuse - # page N-1's tag when caching page N (conditional path sets this below). 
- response_etag: Optional[str] = None - try: - if next_url is not None: - issues, next_url = client.rest_request_url(next_url) - page_num += 1 - else: - params = { - "state": "all", - "per_page": per_page, - "page": page_num, - "sort": "updated", - "direction": "asc", - } - if start_time: - params["since"] = start_time.isoformat() - if etag_cache is not None: - etag = etag_cache.get("issues", page_num, since_iso, "") - data, response_etag, next_url = ( - client.rest_request_conditional_with_link( - endpoint, params=params, etag=etag - ) - ) - if data is None: - logger.debug( - "Issues list page %s: 304 Not Modified, skipping", - page_num, - ) - page_num += 1 - time.sleep(0.2) - continue - issues = data - else: - issues, next_url = client.rest_request_with_link(endpoint, params) - except requests.exceptions.HTTPError as e: - if e.response is not None and e.response.status_code == 422: - logger.debug( - "Issues list: 422 Unprocessable Entity, stopping pagination" - ) - break - raise - - if not issues: - logger.debug("No more issues found") - break - - # Filter out PRs (issues endpoint returns both issues and PRs) - raw_issues = issues - issues = [i for i in raw_issues if "pull_request" not in i] - logger.debug( - "Fetched %s issues (excluding PRs) from page %s", - len(issues), - page_num, - ) - - for issue in issues: - updated_str = issue.get("updated_at") or issue.get("created_at") - if not updated_str: - continue - try: - issue_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) - except (ValueError, TypeError) as e: - logger.debug(f"Failed to parse issue date '{updated_str}': {e}") - continue - - if not _in_date_range(issue_dt, start_time, end_time): - continue - - issue_number = issue.get("number") - if issue_number is not None: - # Fetch full issue detail (list endpoint returns summary only) - try: - full_issue = client.rest_request( - f"/repos/{owner}/{repo}/issues/{issue_number}" - ) - if full_issue and isinstance(full_issue, dict): - issue = 
full_issue - except Exception as e: - logger.debug("Failed to fetch full issue #%s: %s", issue_number, e) - logger.debug(f"Fetching comments for issue #{issue_number}") - comments = fetch_comments_from_github( - client, owner, repo, issue_number, start_time, end_time - ) - logger.debug( - f"Found {len(comments)} comments for issue #{issue_number}" - ) - # Yield nested format: { issue_info: , comments: [...] } - yield {"issue_info": issue, "comments": comments} - - if etag_cache is not None and response_etag: - etag_cache.set("issues", page_num, since_iso, "", response_etag) - - if next_url is None: - logger.debug('Last page reached (no Link rel="next")') - break - time.sleep(0.2) - - def fetch_pr_reviews_from_github( client: GitHubAPIClient, owner: str, @@ -619,98 +500,3 @@ def fetch_issues_and_prs_from_github( logger.debug('Last page reached (no Link rel="next")') break time.sleep(0.2) - - -def fetch_pull_requests_from_github( - client: GitHubAPIClient, - owner: str, - repo: str, - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - etag_cache: Optional[Any] = None, -) -> Iterator[dict]: - """Fetch pull requests from GitHub API (paginated). Yields PR dicts with comments and reviews. - If etag_cache is provided, uses rest_request_conditional for the list GET. 
- """ - logger.debug(f"Fetching PRs for {owner}/{repo} from {start_time} to {end_time}") - page = 1 - per_page = 100 - - while True: - params = { - "state": "all", - "per_page": per_page, - "page": page, - "sort": "updated", - "direction": "desc", - } - response_etag = None - if etag_cache is not None: - etag = etag_cache.get("pulls", page, "", "") - data, response_etag = client.rest_request_conditional( - f"/repos/{owner}/{repo}/pulls", params=params, etag=etag - ) - if data is None: - logger.debug("Pulls list page %s: 304 Not Modified, skipping", page) - page += 1 - time.sleep(0.2) - continue - prs = data - else: - prs = client.rest_request(f"/repos/{owner}/{repo}/pulls", params) - - if not prs: - logger.debug(f"No more PRs found at page {page}") - break - - flag = False - for pr in prs: - updated_str = pr.get("updated_at") or pr.get("created_at") - pr_number = pr.get("number") - logger.debug("Fetching PR #%s with updated_str: %s", pr_number, updated_str) - if updated_str: - try: - pr_dt = datetime.fromisoformat(updated_str.replace("Z", "+00:00")) - if start_time and pr_dt < _make_aware(start_time): - flag = True - break - if end_time and pr_dt > _make_aware(end_time): - continue - except Exception as e: - logger.debug("Failed to parse PR date '%s': %s", updated_str, e) - continue - - if pr_number is None: - continue - - # Fetch full PR detail (list endpoint returns summary only) - try: - full_pr = client.rest_request( - f"/repos/{owner}/{repo}/pulls/{pr_number}" - ) - if full_pr and isinstance(full_pr, dict): - pr = full_pr - except Exception as e: - logger.debug("Failed to fetch full PR #%s: %s", pr_number, e) - - logger.debug("Fetching comments for PR #%s", pr_number) - comments = fetch_comments_from_github( - client, owner, repo, pr_number, start_time, end_time - ) - time.sleep(0.2) - logger.debug("Fetching reviews for PR #%s", pr_number) - reviews = fetch_pr_reviews_from_github( - client, owner, repo, pr_number, start_time, end_time - ) - time.sleep(0.2) - # 
Yield nested format: { pr_info: , comments: [...], reviews: [...] } - yield {"pr_info": pr, "comments": comments, "reviews": reviews} - - if etag_cache is not None and response_etag: - etag_cache.set("pulls", page, "", "", response_etag) - - if len(prs) < per_page or flag: - logger.debug(f"Last page reached (got {len(prs)} PRs, expected {per_page})") - break - page += 1 - time.sleep(0.2) diff --git a/github_activity_tracker/tests/test_fetcher.py b/github_activity_tracker/tests/test_fetcher.py index 47fc87f..22571a7 100644 --- a/github_activity_tracker/tests/test_fetcher.py +++ b/github_activity_tracker/tests/test_fetcher.py @@ -7,9 +7,7 @@ from github_activity_tracker.fetcher import ( fetch_comments_from_github, fetch_commits_from_github, - fetch_issues_from_github, fetch_pr_reviews_from_github, - fetch_pull_requests_from_github, fetch_user_from_github, ) @@ -252,56 +250,6 @@ def test_fetch_comments_from_github_calls_correct_endpoint(): assert "/repos/owner/repo/issues/42/comments" in client.rest_request.call_args[0][0] -# --- fetch_issues_from_github --- - - -def test_fetch_issues_from_github_yields_issue_dicts(): - """fetch_issues_from_github yields nested { issue_info, comments } dicts.""" - client = MagicMock() - # First page via Link-header API (list + next_url); then full issue GET; then comments - client.rest_request_with_link.return_value = ( - [{"number": 1, "title": "Issue 1", "updated_at": "2024-06-01T00:00:00Z"}], - None, - ) - client.rest_request.side_effect = [ - {"number": 1, "title": "Issue 1", "updated_at": "2024-06-01T00:00:00Z"}, - [], # comments for issue 1 - ] - items = list(fetch_issues_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["issue_info"]["number"] == 1 - assert "comments" in items[0] - assert items[0]["comments"] == [] - - -def test_fetch_issues_from_github_filters_out_pulls(): - """fetch_issues_from_github filters out items that have pull_request key.""" - client = MagicMock() - 
client.rest_request_with_link.return_value = ( - [ - {"number": 1, "pull_request": {}}, - {"number": 2, "updated_at": "2024-06-01T00:00:00Z"}, - ], - None, - ) - client.rest_request.side_effect = [ - {"number": 2, "updated_at": "2024-06-01T00:00:00Z"}, # full issue for #2 - [], # comments for issue 2 - ] - items = list(fetch_issues_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["issue_info"]["number"] == 2 - - -def test_fetch_issues_from_github_stops_on_empty_page(): - """fetch_issues_from_github stops when API returns empty list.""" - client = MagicMock() - client.rest_request_with_link.return_value = ([], None) - items = list(fetch_issues_from_github(client, "owner", "repo")) - assert items == [] - client.rest_request.assert_not_called() - - # --- fetch_pr_reviews_from_github --- @@ -331,53 +279,3 @@ def test_fetch_pr_reviews_from_github_calls_pulls_comments(): fetch_pr_reviews_from_github(client, "owner", "repo", pr_number=3) client.rest_request.assert_called_once() assert "/repos/owner/repo/pulls/3/comments" in client.rest_request.call_args[0][0] - - -# --- fetch_pull_requests_from_github --- - - -def test_fetch_pull_requests_from_github_yields_pr_dicts(): - """fetch_pull_requests_from_github yields nested { pr_info, comments, reviews } dicts.""" - client = MagicMock() - client.rest_request.side_effect = [ - [ - { - "number": 1, - "updated_at": "2024-06-01T00:00:00Z", - "created_at": "2024-05-01T00:00:00Z", - }, - ], - { - "number": 1, - "updated_at": "2024-06-01T00:00:00Z", - "created_at": "2024-05-01T00:00:00Z", - }, # full PR - [], # comments for PR 1 - [], # reviews for PR 1 - ] - items = list(fetch_pull_requests_from_github(client, "o", "r")) - assert len(items) == 1 - assert items[0]["pr_info"]["number"] == 1 - assert "comments" in items[0] - assert "reviews" in items[0] - assert items[0]["comments"] == [] - assert items[0]["reviews"] == [] - - -def test_fetch_pull_requests_from_github_stops_on_empty_page(): - 
"""fetch_pull_requests_from_github stops when API returns empty list.""" - client = MagicMock() - client.rest_request.return_value = [] - items = list(fetch_pull_requests_from_github(client, "owner", "repo")) - assert items == [] - - -def test_fetch_pull_requests_from_github_calls_correct_endpoint(): - """fetch_pull_requests_from_github calls .../pulls with state=all.""" - client = MagicMock() - client.rest_request.return_value = [] - list(fetch_pull_requests_from_github(client, "owner", "repo")) - call_args = client.rest_request.call_args - assert "/repos/owner/repo/pulls" in call_args[0][0] - params = call_args[0][1] or {} - assert params["state"] == "all" From 819634c6393e1e6331f775a90bbbffcfd90654a9 Mon Sep 17 00:00:00 2001 From: zho Date: Thu, 26 Mar 2026 05:02:38 +0800 Subject: [PATCH 36/76] #126-addressed the coderabbitai review results --- boost_library_docs_tracker/fetcher.py | 33 +++++++++++-------- .../run_boost_library_docs_tracker.py | 1 - boost_library_docs_tracker/preprocessor.py | 11 +++---- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/boost_library_docs_tracker/fetcher.py b/boost_library_docs_tracker/fetcher.py index 5fe6a86..66c0186 100644 --- a/boost_library_docs_tracker/fetcher.py +++ b/boost_library_docs_tracker/fetcher.py @@ -321,19 +321,26 @@ def crawl_library_pages( # Enqueue in-scope links soup = BeautifulSoup(resp.text, "lxml") lib_segment = lib_key.split("/")[-1] - for a in soup.find_all("a", href=True): - href: str = a["href"] - abs_url = urljoin(final_url, href) - # Strip fragment - abs_url = abs_url.split("#")[0] - if not abs_url.startswith(base_url): - continue - # Stay within this library's doc subtree (path contains lib segment) - if lib_segment not in abs_url: - continue - if abs_url in visited or abs_url in queue: - continue - queue.append(abs_url) + if not lib_segment: + logger.warning( + "Empty library key segment for lib_key=%r; skipping link discovery for %s", + lib_key, + final_url, + ) + else: + for a in 
soup.find_all("a", href=True): + href: str = a["href"] + abs_url = urljoin(final_url, href) + # Strip fragment + abs_url = abs_url.split("#")[0] + if not abs_url.startswith(base_url): + continue + # Stay within this library's doc subtree (path contains lib segment) + if lib_segment not in abs_url: + continue + if abs_url in visited or abs_url in queue: + continue + queue.append(abs_url) logger.debug( "Crawled %d pages for root %s (max_pages=%s)", diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index 91a0c24..ea70317 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -146,7 +146,6 @@ def _run( ) mode = "local-zip" if use_local else "HTTP crawl" self.stdout.write(f"Scrape mode: {mode}") - versions = [f"boost-1.{i}.0" for i in range(64, 91)] for version in versions: self._process_version( diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index 32e8019..f8e0534 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -46,8 +46,9 @@ def preprocess_for_pinecone( Returns (documents, is_chunked=False, metas_to_update). 
""" - upsert_records = _select_upsert_records(failed_ids) - meta_records = _select_metadata_update_records(failed_ids, final_sync_at) + int_failed_ids = _parse_int_ids(failed_ids) + upsert_records = _select_upsert_records(int_failed_ids) + meta_records = _select_metadata_update_records(int_failed_ids, final_sync_at) if not upsert_records and not meta_records: return [], False, [] @@ -62,11 +63,10 @@ def preprocess_for_pinecone( # --------------------------------------------------------------------------- -def _select_upsert_records(failed_ids: list[str]) -> list[BoostDocContent]: +def _select_upsert_records(int_failed_ids: list[int]) -> list[BoostDocContent]: """Rows to vector-upsert: not yet upserted or explicitly failed (retry).""" from django.db.models import Q - int_failed_ids = _parse_int_ids(failed_ids) query = Q(is_upserted=False) if int_failed_ids: query |= Q(pk__in=int_failed_ids) @@ -80,14 +80,13 @@ def _select_upsert_records(failed_ids: list[str]) -> list[BoostDocContent]: def _select_metadata_update_records( - failed_ids: list[str], + int_failed_ids: list[int], final_sync_at: datetime | None, ) -> list[BoostDocContent]: """Rows needing Pinecone metadata refresh only (already upserted, scraped since sync).""" if final_sync_at is None: return [] - int_failed_ids = _parse_int_ids(failed_ids) qs = ( BoostDocContent.objects.filter( is_upserted=True, From 7219b54592036c3aa6d0ef4684774a53e71d780c Mon Sep 17 00:00:00 2001 From: zho Date: Thu, 26 Mar 2026 18:01:59 +0800 Subject: [PATCH 37/76] #126-added version_operations and updated version operation logics --- .../run_boost_library_docs_tracker.py | 8 +- boost_library_docs_tracker/preprocessor.py | 24 ++- boost_library_docs_tracker/services.py | 8 +- .../tests/test_services.py | 6 +- boost_library_tracker/release_check.py | 27 +-- boost_library_usage_dashboard/utils.py | 24 +-- boost_usage_tracker/boost_searcher.py | 24 +-- core/tests/test_boost_version_operations.py | 78 +++++++ 
core/utils/boost_version_operations.py | 197 ++++++++++++++++++ .../service_api/boost_library_docs_tracker.md | 35 ++-- 10 files changed, 338 insertions(+), 93 deletions(-) create mode 100644 core/tests/test_boost_version_operations.py create mode 100644 core/utils/boost_version_operations.py diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index ea70317..424f37a 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -325,7 +325,7 @@ def _process_library( def _save_pages_to_workspace_and_db( self, *, version, lib_name, lib_version_id, boost_version_id, pages ): - created = changed = unchanged = 0 + created = unchanged = 0 for url, page_text in pages: content_hash = hashlib.sha256(page_text.encode()).hexdigest() @@ -348,8 +348,6 @@ def _save_pages_to_workspace_and_db( if change_type == "created": created += 1 - elif change_type == "content_changed": - changed += 1 else: unchanged += 1 @@ -363,9 +361,7 @@ def _save_pages_to_workspace_and_db( exc, ) - self.stdout.write( - f" [{lib_name}] created={created}, changed={changed}, unchanged={unchanged}." - ) + self.stdout.write(f" [{lib_name}] created={created}, unchanged={unchanged}.") # ------------------------------------------------------------------ # Pinecone sync diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index f8e0534..ea9402a 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -20,6 +20,8 @@ from datetime import datetime from typing import Any +from core.utils.boost_version_operations import encode_boost_version_string + from .models import BoostDocContent from . 
import workspace @@ -163,17 +165,23 @@ def _build_documents( library_name = _get_library_name(doc_content) + metadata: dict[str, Any] = { + "doc_id": doc_content.content_hash, + "url": doc_content.url, + "library_name": library_name, + "ids": str(doc_content.pk), + } + fk = encode_boost_version_string(first_version_str) + if fk is not None: + metadata["first_version_key"] = fk + lk = encode_boost_version_string(last_version_str) + if lk is not None: + metadata["last_version_key"] = lk + documents.append( { "content": page_content, - "metadata": { - "doc_id": doc_content.content_hash, - "url": doc_content.url, - "first_version": first_version_str, - "last_version": last_version_str, - "library_name": library_name, - "ids": str(doc_content.pk), - }, + "metadata": metadata, } ) ids_to_mark.append(doc_content.pk) diff --git a/boost_library_docs_tracker/services.py b/boost_library_docs_tracker/services.py index b8e5e45..830517e 100644 --- a/boost_library_docs_tracker/services.py +++ b/boost_library_docs_tracker/services.py @@ -37,9 +37,10 @@ def get_or_create_doc_content( - On update: updates last_version to version_id. Returns (doc_content, change_type) where change_type is one of: - "created" — content_hash was not in DB; row inserted. - "content_changed" — URL exists for this hash but url field differs; url updated. - "unchanged" — content_hash already exists; only scraped_at and last_version updated. + "created" — content_hash was not in DB; row inserted. + "unchanged" — content_hash already existed; row may still be updated + (url, scraped_at, last_version / first_version as applicable). The document + body identity is the same hash, not a new page. Raises ValueError if url is empty. 
""" @@ -69,7 +70,6 @@ def get_or_create_doc_content( if obj.url != normalized_url: obj.url = normalized_url update_fields.append("url") - change_type = "content_changed" if version_id is not None: obj.last_version_id = version_id diff --git a/boost_library_docs_tracker/tests/test_services.py b/boost_library_docs_tracker/tests/test_services.py index 1d47d91..7e4fb42 100644 --- a/boost_library_docs_tracker/tests/test_services.py +++ b/boost_library_docs_tracker/tests/test_services.py @@ -41,8 +41,8 @@ def test_get_or_create_doc_content_unchanged_when_same_hash(): @pytest.mark.django_db -def test_get_or_create_doc_content_content_changed_when_url_differs(): - """get_or_create_doc_content returns 'content_changed' when url differs for same hash.""" +def test_get_or_create_doc_content_unchanged_when_url_differs_same_hash(): + """Same content_hash with a new URL still returns 'unchanged' (hash identity unchanged).""" services.get_or_create_doc_content( url="https://example.com/old-page", content_hash="c" * 64, @@ -51,7 +51,7 @@ def test_get_or_create_doc_content_content_changed_when_url_differs(): url="https://example.com/new-page", content_hash="c" * 64, ) - assert change_type == "content_changed" + assert change_type == "unchanged" obj2.refresh_from_db() assert obj2.url == "https://example.com/new-page" diff --git a/boost_library_tracker/release_check.py b/boost_library_tracker/release_check.py index b3fed25..7b9ec3d 100644 --- a/boost_library_tracker/release_check.py +++ b/boost_library_tracker/release_check.py @@ -12,7 +12,8 @@ """ import logging -import re + +from core.utils.boost_version_operations import parse_stable_boost_release_tag from boost_library_tracker.models import BoostVersion from github_ops.client import GitHubAPIClient @@ -23,29 +24,9 @@ MAIN_OWNER = "boostorg" MAIN_REPO = "boost" -# Only boost-X.Y.Z (three numeric parts, no suffix like -beta, -rc, etc.) 
-BOOST_TAG_PATTERN = re.compile(r"^boost-(\d+)\.(\d+)\.(\d+)$") MIN_BOOST_VERSION = (1, 16, 1) -def _parse_stable_version(tag_name: str) -> str | None: - """ - If ``tag_name`` is a stable release tag ``boost-X.Y.Z`` with version >= MIN_BOOST_VERSION, - return the canonical tag string (e.g. ``boost-1.90.0``). - - Return ``None`` for non-matching names, pre-release-style tags, or versions below the minimum. - """ - if not tag_name: - return None - m = BOOST_TAG_PATTERN.match(tag_name.strip()) - if not m: - return None - major, minor, patch = int(m.group(1)), int(m.group(2)), int(m.group(3)) - if (major, minor, patch) < MIN_BOOST_VERSION: - return None - return f"boost-{major}.{minor}.{patch}" - - def all_boost_versions_from_api() -> list[tuple[str, str]] | None: """ List stable Boost release tags from GitHub (``/repos/boostorg/boost/tags``). @@ -76,7 +57,9 @@ def all_boost_versions_from_api() -> list[tuple[str, str]] | None: if not page_tags: break for tag in page_tags: - stable_tag = _parse_stable_version(tag.get("name", "")) + stable_tag = parse_stable_boost_release_tag( + tag.get("name", ""), MIN_BOOST_VERSION + ) if not stable_tag: continue tag_commit = tag.get("commit") or {} diff --git a/boost_library_usage_dashboard/utils.py b/boost_library_usage_dashboard/utils.py index e2c2fd8..8ce8167 100644 --- a/boost_library_usage_dashboard/utils.py +++ b/boost_library_usage_dashboard/utils.py @@ -1,29 +1,19 @@ import re +from core.utils.boost_version_operations import ( + loose_version_tuple, + normalize_boost_version_string, +) + def _version_tuple(version: str) -> tuple[int, int, int]: """Parse version string (e.g. 
'1.84.0', 'release-2.1.9-extra') to (major, minor, patch) for sorting.""" - if not version: - return (0, 0, 0) - parts = version.strip().split(".") - out: list[int] = [] - for part in parts[:3]: - number = "".join(c for c in part if c.isdigit()) - out.append(int(number) if number else 0) - while len(out) < 3: - out.append(0) - return tuple(out[:3]) + return loose_version_tuple(version) def normalize_version_str(version_str: str) -> str | None: """Normalize a version string for comparison; returns None if invalid or pre-1.0.""" - version = (version_str or "").strip().replace("boost-", "") - version = version.replace("-", ".").replace("_", ".") - if not version or version.startswith("0."): - return None - if len(version.split(".")) == 2: - version = f"{version}.0" - return version + return normalize_boost_version_string(version_str) def format_percent(current: int, total: int) -> str: diff --git a/boost_usage_tracker/boost_searcher.py b/boost_usage_tracker/boost_searcher.py index 457fa48..e9ec560 100644 --- a/boost_usage_tracker/boost_searcher.py +++ b/boost_usage_tracker/boost_searcher.py @@ -23,6 +23,11 @@ from datetime import datetime from typing import Any, Optional +from core.utils.boost_version_operations import ( + decode_boost_version, + normalize_boost_version_string, +) + from github_ops.client import GitHubAPIClient logger = logging.getLogger(__name__) @@ -100,15 +105,6 @@ def extract_boost_includes(content: str) -> list[str]: ] -def _normalize_version(version_str: str) -> Optional[str]: - version = version_str.replace("-", ".").replace("_", ".") - if version.startswith("0."): - return None - if len(version.split(".")) == 2: - version = f"{version}.0" - return version - - def extract_boost_version_from_content( content: str, filename: str, @@ -122,28 +118,26 @@ def extract_boost_version_from_content( match = BOOST_VERSION_HPP_PATTERN.search(content) if match: ver_int = int(match.group(1)) - major = ver_int // 100_000 - minor = (ver_int // 100) % 1_000 - 
patch = ver_int % 100 + major, minor, patch = decode_boost_version(ver_int) return f"{major}.{minor}.{patch}" if lower in ("cmakelists.txt", "cmakelists.cmake"): for pat in CMAKE_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) if lower in ("conanfile.txt", "conanfile.py"): for pat in CONAN_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) if lower == "vcpkg.json": for pat in VCPKG_VERSION_PATTERNS: match = pat.search(content) if match: - return _normalize_version(match.group(1).strip()) + return normalize_boost_version_string(match.group(1).strip()) return None diff --git a/core/tests/test_boost_version_operations.py b/core/tests/test_boost_version_operations.py new file mode 100644 index 0000000..5f31b8d --- /dev/null +++ b/core/tests/test_boost_version_operations.py @@ -0,0 +1,78 @@ +"""Tests for core.utils.boost_version_operations.""" + +import pytest + +from core.utils.boost_version_operations import ( + compare_boost_version_tuples, + compare_encoded_versions, + compare_loose_version_strings, + decode_boost_version, + encode_boost_version, + encode_boost_version_string, + loose_version_tuple, + normalize_boost_version_string, + parse_boost_version_string, + parse_stable_boost_release_tag, +) + + +def test_encode_decode_round_trip(): + assert encode_boost_version(1, 86, 0) == 108_600 + assert decode_boost_version(108_600) == (1, 86, 0) + assert decode_boost_version(1_00_900) == (1, 9, 0) + + +def test_encode_boost_version_string(): + assert encode_boost_version_string("1.86.0") == 108_600 + assert encode_boost_version_string("boost-1.10.0") == 101_000 + assert encode_boost_version_string("1_56_0") == 105_600 + + +def test_parse_invalid_returns_none(): + assert parse_boost_version_string("") is None + assert 
parse_boost_version_string("not-a-version") is None + + +def test_encode_rejects_out_of_range(): + with pytest.raises(ValueError): + encode_boost_version(1, 1000, 0) + with pytest.raises(ValueError): + encode_boost_version(1, 0, 100) + + +def test_loose_version_tuple_empty_and_digits(): + assert loose_version_tuple("") == (0, 0, 0) + assert loose_version_tuple("1.82.x") == (1, 82, 0) + assert loose_version_tuple("release-2.1.9-extra") == (2, 1, 9) + + +def test_normalize_boost_version_string(): + assert normalize_boost_version_string("1.82") == "1.82.0" + assert normalize_boost_version_string("0.99") is None + assert normalize_boost_version_string("") is None + assert normalize_boost_version_string("boost-1.2.3") == "1.2.3" + + +def test_compare_boost_version_tuples(): + assert compare_boost_version_tuples((1, 0, 0), (2, 0, 0)) == -1 + assert compare_boost_version_tuples((1, 82, 0), (1, 82, 0)) == 0 + assert compare_boost_version_tuples((2, 0, 0), (1, 99, 99)) == 1 + + +def test_compare_loose_version_strings(): + assert compare_loose_version_strings("1.0", "2.0") == -1 + assert compare_loose_version_strings("1.82.x", "1.81.0") == 1 + + +def test_compare_encoded_versions(): + assert compare_encoded_versions(100_000, 200_000) == -1 + assert compare_encoded_versions(108_600, 108_600) == 0 + + +def test_parse_stable_boost_release_tag(): + min_v = (1, 16, 1) + assert parse_stable_boost_release_tag("boost-1.90.0", min_v) == "boost-1.90.0" + assert parse_stable_boost_release_tag("boost-1.16.1", min_v) == "boost-1.16.1" + assert parse_stable_boost_release_tag("boost-1.16.0", min_v) is None + assert parse_stable_boost_release_tag("boost-1.90.0-beta", min_v) is None + assert parse_stable_boost_release_tag("", min_v) is None diff --git a/core/utils/boost_version_operations.py b/core/utils/boost_version_operations.py new file mode 100644 index 0000000..7383590 --- /dev/null +++ b/core/utils/boost_version_operations.py @@ -0,0 +1,197 @@ +""" +Boost release version helpers: 
macro packing, strict parse for Pinecone keys, +loose parse for sorting messy strings, normalization, and comparisons. + +**Strict (``BOOST_VERSION`` macro / Pinecone metadata keys)** — numeric packing:: + + major * 100_000 + minor * 100 + patch + +Requires ``minor <= 999`` and ``patch <= 99`` for collision-free encoding. +Use :func:`parse_boost_version_string` and :func:`encode_boost_version_string`. + +**Loose (sorting / analytics)** — digit runs per dot-separated segment; empty +input → ``(0, 0, 0)``. Handles strings like ``release-2.1.9-extra``. Use +:func:`loose_version_tuple` / :func:`compare_loose_version_strings`. + +**GitHub stable tags** — exact ``boost-X.Y.Z`` (no ``-beta`` / ``-rc`` suffix). +Use :func:`parse_stable_boost_release_tag` with a caller-supplied minimum tuple. +""" + +from __future__ import annotations + +import re + +# --- Macro packing (BOOST_VERSION / version.hpp) -------------------------------- + +MAJOR_MULTIPLIER = 100_000 +MINOR_MULTIPLIER = 100 + +_MAX_MINOR = 999 +_MAX_PATCH = 99 + +_VERSION_STRIP_PREFIX = re.compile(r"^boost[-_]", re.IGNORECASE) + + +def encode_boost_version(major: int, minor: int, patch: int) -> int: + """Return the packed integer (``major * 100_000 + minor * 100 + patch``).""" + if major < 0 or minor < 0 or patch < 0: + raise ValueError( + f"Version components must be non-negative, got {major}.{minor}.{patch}" + ) + if minor > _MAX_MINOR or patch > _MAX_PATCH: + raise ValueError( + f"Encoding requires minor <= {_MAX_MINOR} and patch <= {_MAX_PATCH} " + f"(got {major}.{minor}.{patch})" + ) + return major * MAJOR_MULTIPLIER + minor * MINOR_MULTIPLIER + patch + + +def decode_boost_version(encoded: int) -> tuple[int, int, int]: + """Split a packed ``BOOST_VERSION``-style integer into (major, minor, patch).""" + if encoded < 0: + raise ValueError(f"encoded version must be non-negative, got {encoded}") + major = encoded // MAJOR_MULTIPLIER + minor = (encoded // MINOR_MULTIPLIER) % 1000 + patch = encoded % 
MINOR_MULTIPLIER + return major, minor, patch + + +def parse_boost_version_string(version_str: str) -> tuple[int, int, int] | None: + """ + Parse ``1.86.0``, ``boost-1.86.0``, or ``1_86_0`` into (major, minor, patch). + + Missing minor/patch segments default to 0. Returns None if unparseable or + out of encodable range. + """ + if not version_str or not str(version_str).strip(): + return None + s = _VERSION_STRIP_PREFIX.sub("", str(version_str).strip()) + s = s.replace("_", ".") + parts = s.split(".") + if not parts or not parts[0].strip(): + return None + try: + major = int(parts[0].strip()) + minor = int(parts[1].strip()) if len(parts) > 1 else 0 + patch = int(parts[2].strip()) if len(parts) > 2 else 0 + except ValueError: + return None + if minor > _MAX_MINOR or patch > _MAX_PATCH: + return None + if major < 0 or minor < 0 or patch < 0: + return None + return major, minor, patch + + +def encode_boost_version_string(version_str: str) -> int | None: + """Parse *version_str* and return the packed int, or None if invalid.""" + triple = parse_boost_version_string(version_str) + if triple is None: + return None + major, minor, patch = triple + try: + return encode_boost_version(major, minor, patch) + except ValueError: + return None + + +# --- Loose tuple (sorting / dirty strings) ------------------------------------ + + +def loose_version_tuple(version: str) -> tuple[int, int, int]: + """ + Parse *version* to (major, minor, patch) for sorting. + + Each segment uses the longest digit run only (e.g. ``1.82.x`` → ``(1, 82, 0)``). + Empty string → ``(0, 0, 0)``. 
+ """ + if not version: + return (0, 0, 0) + parts = version.strip().split(".") + out: list[int] = [] + for part in parts[:3]: + number = "".join(c for c in part if c.isdigit()) + out.append(int(number) if number else 0) + while len(out) < 3: + out.append(0) + return tuple(out[:3]) + + +# --- Normalization ------------------------------------------------------------ + + +def normalize_boost_version_string(version_str: str) -> str | None: + """ + Normalize a version string for comparison; returns None if invalid or pre-1.0. + + Strips ``boost-`` prefix, maps ``-`` / ``_`` to ``.``, appends ``.0`` when + only two segments are present. + """ + version = (version_str or "").strip().replace("boost-", "") + version = version.replace("-", ".").replace("_", ".") + if not version or version.startswith("0."): + return None + if len(version.split(".")) == 2: + version = f"{version}.0" + return version + + +# --- Comparison --------------------------------------------------------------- + + +def compare_boost_version_tuples( + a: tuple[int, int, int], b: tuple[int, int, int] +) -> int: + """Return -1 if a < b, 0 if equal, 1 if a > b.""" + if a < b: + return -1 + if a > b: + return 1 + return 0 + + +def compare_loose_version_strings(left: str, right: str) -> int: + """Compare two version strings using :func:`loose_version_tuple`.""" + return compare_boost_version_tuples( + loose_version_tuple(left), loose_version_tuple(right) + ) + + +def compare_encoded_versions(i: int, j: int) -> int: + """ + Compare two packed ints from :func:`encode_boost_version`. + + Do not use for arbitrary integers that were not produced by that encoding. 
+ """ + if i < j: + return -1 + if i > j: + return 1 + return 0 + + +# --- GitHub stable release tags (boostorg/boost) -------------------------------- + +BOOST_STABLE_RELEASE_TAG_PATTERN = re.compile(r"^boost-(\d+)\.(\d+)\.(\d+)$") + + +def parse_stable_boost_release_tag( + tag_name: str, + min_version: tuple[int, int, int], +) -> str | None: + """ + If *tag_name* matches ``boost-X.Y.Z`` (three numeric parts only) and the + version is >= *min_version*, return the canonical tag (e.g. ``boost-1.90.0``). + + Returns ``None`` for empty names, non-matching patterns, or versions below + *min_version*. + """ + if not tag_name: + return None + m = BOOST_STABLE_RELEASE_TAG_PATTERN.match(tag_name.strip()) + if not m: + return None + major, minor, patch = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if compare_boost_version_tuples((major, minor, patch), min_version) == -1: + return None + return f"boost-{major}.{minor}.{patch}" diff --git a/docs/service_api/boost_library_docs_tracker.md b/docs/service_api/boost_library_docs_tracker.md index 6fadb7c..9e89a2a 100644 --- a/docs/service_api/boost_library_docs_tracker.md +++ b/docs/service_api/boost_library_docs_tracker.md @@ -9,31 +9,30 @@ ## BoostDocContent -| Function | Parameter types | Return type | Notes | -|---|---|---|---| -| `get_or_create_doc_content` | `url: str`, `page_content: str`, `content_hash: str` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. | +| Function | Parameter types | Return type | Notes | +| --------------------------- | ----------------------------------------------------------------- | ----------------------------- | -------------------------------------------------------- | +| `get_or_create_doc_content` | `url: str`, `content_hash: str`, `version_id: int \| None = None` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. 
| ### `get_or_create_doc_content` return values The second element is a `str` indicating what changed: -| `change_type` | Condition | Side effects | -|---|---|---| -| `"created"` | URL not in DB | Inserts row with `page_content`, `content_hash`, `scraped_at=now()`. | -| `"content_changed"` | URL exists; `content_hash` differs | Updates `page_content`, `content_hash`, `scraped_at=now()`. | -| `"unchanged"` | URL exists; `content_hash` same | Updates `scraped_at=now()` only. | +| `change_type` | Condition | Side effects | +| ------------- | ----------------------------- | ------------------------------------------------------------------------------- | +| `"created"` | `content_hash` not in DB | Inserts row with `url`, `content_hash`, `scraped_at=now()`. | +| `"unchanged"` | `content_hash` already exists | Updates `scraped_at`, and may update `url` and version FKs; same hash identity. | --- ## BoostLibraryDocumentation -| Function | Parameter types | Return type | Notes | -|---|---|---|---| -| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int`, `page_count: int` | `tuple[BoostLibraryDocumentation, bool]` | Get or create a row for the (library_version, doc_content) pair. Sets `page_count`. If exists, updates `page_count` if changed. | -| `mark_relation_running` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="running"`, `updated_at=now()`. | -| `mark_relation_completed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="completed"`, `updated_at=now()`. | -| `mark_relation_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | -| `get_pending_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | Returns all rows for this library-version where `status != "completed"`. Empty queryset means the library-version is fully done (skip on restart). 
| -| `get_docs_pending_sync` | — | `QuerySet[BoostLibraryDocumentation]` | Returns all rows where `status in ("pending", "failed")`. Used by the Pinecone sync step. | -| `mark_doc_synced` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="synced"` (or equivalent completed sync state), `updated_at=now()`. | -| `mark_doc_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | +| Function | Parameter types | Return type | Notes | +| -------------------------------------- | ------------------------------------------------------------------- | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int`, `page_count: int` | `tuple[BoostLibraryDocumentation, bool]` | Get or create a row for the (library_version, doc_content) pair. Sets `page_count`. If exists, updates `page_count` if changed. | +| `mark_relation_running` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="running"`, `updated_at=now()`. | +| `mark_relation_completed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="completed"`, `updated_at=now()`. | +| `mark_relation_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | +| `get_pending_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | Returns all rows for this library-version where `status != "completed"`. Empty queryset means the library-version is fully done (skip on restart). | +| `get_docs_pending_sync` | — | `QuerySet[BoostLibraryDocumentation]` | Returns all rows where `status in ("pending", "failed")`. Used by the Pinecone sync step. 
| +| `mark_doc_synced` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="synced"` (or equivalent completed sync state), `updated_at=now()`. | +| `mark_doc_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | From 4e7b433fbf048d308d09415198a370ff07217da7 Mon Sep 17 00:00:00 2001 From: zho Date: Fri, 27 Mar 2026 01:17:15 +0800 Subject: [PATCH 38/76] #126-update docs/service_api/boost_library_docs_tracker.md --- .../service_api/boost_library_docs_tracker.md | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/service_api/boost_library_docs_tracker.md b/docs/service_api/boost_library_docs_tracker.md index 9e89a2a..7ad8f4b 100644 --- a/docs/service_api/boost_library_docs_tracker.md +++ b/docs/service_api/boost_library_docs_tracker.md @@ -5,13 +5,18 @@ **Type notation:** `BoostDocContent` and `BoostLibraryDocumentation` are from `boost_library_docs_tracker.models`. `BoostLibraryVersion` is from `boost_library_tracker.models` (read-only cross-app reference). +**Pinecone upsert state** is stored on `BoostDocContent.is_upserted`, not on `BoostLibraryDocumentation` (the join table has only the two FKs plus `created_at`). + --- ## BoostDocContent -| Function | Parameter types | Return type | Notes | -| --------------------------- | ----------------------------------------------------------------- | ----------------------------- | -------------------------------------------------------- | -| `get_or_create_doc_content` | `url: str`, `content_hash: str`, `version_id: int \| None = None` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. 
| +| Function | Parameter types | Return type | Notes | +| -------------------------------- | ------------------------------------------------------------------- | ----------------------------- | --------------------------------------------------------------------- | +| `get_or_create_doc_content` | `url: str`, `content_hash: str`, `version_id: int \| None = None` | `tuple[BoostDocContent, str]` | See return values below. `ValueError` if `url` is empty. | +| `set_doc_content_upserted` | `doc: BoostDocContent`, `value: bool` | `BoostDocContent` | Sets `is_upserted`. | +| `set_doc_content_upserted_by_ids`| `ids: list[int]`, `value: bool` | `int` | Bulk `UPDATE`; returns number of rows updated. | +| `get_unupserted_doc_contents` | — | `QuerySet[BoostDocContent]` | `is_upserted=False`; used for Pinecone sync worklists. | ### `get_or_create_doc_content` return values @@ -19,20 +24,16 @@ The second element is a `str` indicating what changed: | `change_type` | Condition | Side effects | | ------------- | ----------------------------- | ------------------------------------------------------------------------------- | -| `"created"` | `content_hash` not in DB | Inserts row with `url`, `content_hash`, `scraped_at=now()`. | +| `"created"` | `content_hash` not in DB | Inserts row with `url`, `content_hash`, `scraped_at=now()`, `is_upserted=False`. May set `first_version_id` / `last_version_id` when `version_id` is passed. | | `"unchanged"` | `content_hash` already exists | Updates `scraped_at`, and may update `url` and version FKs; same hash identity. 
| --- ## BoostLibraryDocumentation -| Function | Parameter types | Return type | Notes | -| -------------------------------------- | ------------------------------------------------------------------- | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int`, `page_count: int` | `tuple[BoostLibraryDocumentation, bool]` | Get or create a row for the (library_version, doc_content) pair. Sets `page_count`. If exists, updates `page_count` if changed. | -| `mark_relation_running` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="running"`, `updated_at=now()`. | -| `mark_relation_completed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="completed"`, `updated_at=now()`. | -| `mark_relation_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | -| `get_pending_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | Returns all rows for this library-version where `status != "completed"`. Empty queryset means the library-version is fully done (skip on restart). | -| `get_docs_pending_sync` | — | `QuerySet[BoostLibraryDocumentation]` | Returns all rows where `status in ("pending", "failed")`. Used by the Pinecone sync step. | -| `mark_doc_synced` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="synced"` (or equivalent completed sync state), `updated_at=now()`. | -| `mark_doc_failed` | `doc: BoostLibraryDocumentation` | `BoostLibraryDocumentation` | Sets `status="failed"`, `updated_at=now()`. | +Join table: one row per `(boost_library_version, boost_doc_content)` pair. **No** `page_count`, status fields, or `updated_at` on the model. 
+ +| Function | Parameter types | Return type | Notes | +| --------------------------------- | ---------------------------------------------------- | ---------------------------------------- | --------------------------------------------------------------------- | +| `link_content_to_library_version` | `library_version_id: int`, `doc_content_id: int` | `tuple[BoostLibraryDocumentation, bool]` | `get_or_create` on the pair. Second value is `created`. | +| `get_docs_for_library_version` | `library_version_id: int` | `QuerySet[BoostLibraryDocumentation]` | All join rows for that library version. | From 5f61c33ca85a6972dec4692d56680a477c2804b9 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Thu, 26 Mar 2026 14:34:02 -0400 Subject: [PATCH 39/76] Refactor fetch_issues_and_prs_from_github to separate the ETag/params list phase from Link pagination and clarify the docstring - #125 --- config/settings.py | 11 +++ github_activity_tracker/fetcher.py | 140 ++++++++++++++++++---------- github_activity_tracker/services.py | 1 - 3 files changed, 100 insertions(+), 52 deletions(-) diff --git a/config/settings.py b/config/settings.py index 9e0419a..ed9c9ff 100644 --- a/config/settings.py +++ b/config/settings.py @@ -583,3 +583,14 @@ def _slack_team_scope_from_env(): "level": "ERROR", } LOGGING["root"]["handlers"].append("slack") + +# You can add your own Django apps here by adding them to the EXTRA_INSTALLED_APPS list in config/local_settings.py. +try: + from . 
import local_settings as _local_settings + + _LOCAL_EXTRA_INSTALLED_APPS = tuple( + getattr(_local_settings, "EXTRA_INSTALLED_APPS", ()) + ) +except ImportError: + _LOCAL_EXTRA_INSTALLED_APPS = () +INSTALLED_APPS = [*INSTALLED_APPS, *_LOCAL_EXTRA_INSTALLED_APPS] diff --git a/github_activity_tracker/fetcher.py b/github_activity_tracker/fetcher.py index 71d3490..23ffdec 100644 --- a/github_activity_tracker/fetcher.py +++ b/github_activity_tracker/fetcher.py @@ -376,8 +376,14 @@ def fetch_issues_and_prs_from_github( - PRs → yield {"pr_info": , "comments": [...], "reviews": [...]} Uses Link-header pagination (direction=asc, sort=updated) so items are processed - oldest-updated-first. If etag_cache is provided, uses conditional GET for the first - page; a 304 means nothing has changed and the function returns immediately. + oldest-updated-first. + + When etag_cache is provided, list requests built from query params use conditional + GET (If-None-Match); ETags are keyed by list type, page, and since_iso in the cache. + A 304 response has no JSON for that page; pagination may continue by advancing + ``page`` while still on the params path, or by following ``Link`` after a 200. + + Requests made via full ``next`` URLs (``rest_request_url``) do not use the ETag cache. 
""" logger.debug( "Fetching issues+PRs for %s/%s from %s to %s", owner, repo, start_time, end_time @@ -388,56 +394,19 @@ def fetch_issues_and_prs_from_github( next_url: Optional[str] = None page_num = 1 - while True: - response_etag: Optional[str] = None - try: - if next_url is not None: - items, next_url = client.rest_request_url(next_url) - page_num += 1 - else: - params: dict = { - "state": "all", - "per_page": per_page, - "page": page_num, - "sort": "updated", - "direction": "asc", - } - if start_time: - params["since"] = start_time.isoformat() - if etag_cache is not None: - etag = etag_cache.get("issues_and_prs", page_num, since_iso, "") - data, response_etag, next_url = ( - client.rest_request_conditional_with_link( - endpoint, params=params, etag=etag - ) - ) - if data is None: - logger.debug( - "Issues+PRs list page %s: 304 Not Modified, skipping", - page_num, - ) - page_num += 1 - time.sleep(0.2) - continue - items = data - else: - items, next_url = client.rest_request_with_link(endpoint, params) - except requests.exceptions.HTTPError as e: - if e.response is not None and e.response.status_code == 422: - logger.debug( - "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" - ) - break - raise - - if not items: - logger.debug("No more issues/PRs found") - break - - logger.debug( - "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num - ) + def _issues_list_params(page: int) -> dict: + params: dict = { + "state": "all", + "per_page": per_page, + "page": page, + "sort": "updated", + "direction": "asc", + } + if start_time: + params["since"] = start_time.isoformat() + return params + def _yield_issue_pr_items_for_list_page(items: list) -> Iterator[dict]: for item in items: updated_str = item.get("updated_at") or item.get("created_at") if not updated_str: @@ -493,10 +462,79 @@ def fetch_issues_and_prs_from_github( logger.debug("Found %d comments for issue #%s", len(comments), number) yield {"issue_info": item, "comments": 
comments} + # Phase 1: params-based list requests (optional conditional GET + ETag cache). + while next_url is None: + response_etag: Optional[str] = None + try: + params = _issues_list_params(page_num) + if etag_cache is not None: + etag = etag_cache.get("issues_and_prs", page_num, since_iso, "") + data, response_etag, next_url = ( + client.rest_request_conditional_with_link( + endpoint, params=params, etag=etag + ) + ) + if data is None: + logger.debug( + "Issues+PRs list page %s: 304 Not Modified, skipping", + page_num, + ) + page_num += 1 + time.sleep(0.2) + continue + items = data + else: + items, next_url = client.rest_request_with_link(endpoint, params) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code == 422: + logger.debug( + "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" + ) + return + raise + + if not items: + logger.debug("No more issues/PRs found") + break + + logger.debug( + "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num + ) + + yield from _yield_issue_pr_items_for_list_page(items) + if etag_cache is not None and response_etag: etag_cache.set("issues_and_prs", page_num, since_iso, "", response_etag) if next_url is None: logger.debug('Last page reached (no Link rel="next")') break + break + + # Phase 2: follow Link rel="next" URLs (full GET; no ETag cache). 
+ while next_url: time.sleep(0.2) + try: + items, next_url = client.rest_request_url(next_url) + except requests.exceptions.HTTPError as e: + if e.response is not None and e.response.status_code == 422: + logger.debug( + "Issues+PRs list: 422 Unprocessable Entity, stopping pagination" + ) + return + raise + page_num += 1 + + if not items: + logger.debug("No more issues/PRs found") + break + + logger.debug( + "Fetched %d items (issues+PRs combined) from page %s", len(items), page_num + ) + + yield from _yield_issue_pr_items_for_list_page(items) + + if next_url is None: + logger.debug('Last page reached (no Link rel="next")') + break diff --git a/github_activity_tracker/services.py b/github_activity_tracker/services.py index 041be69..7a5e651 100644 --- a/github_activity_tracker/services.py +++ b/github_activity_tracker/services.py @@ -196,7 +196,6 @@ def create_or_update_commit( commit_at: Optional[datetime] = None, ) -> tuple[GitCommit, bool]: """Create or update a GitCommit by repo + commit_hash. 
Returns (commit, created).""" - from datetime import datetime if not commit_at: commit_at = datetime.now(timezone.utc) From 7c76ff0d135a547dd39494b756ece59f77613608 Mon Sep 17 00:00:00 2001 From: zho Date: Fri, 27 Mar 2026 15:14:50 +0800 Subject: [PATCH 40/76] #126-created text_processing.py for general purpose and applied concurrent processing to the pinecone update --- boost_library_docs_tracker/html_to_md.py | 9 +- .../run_boost_library_docs_tracker.py | 2 + boost_library_docs_tracker/preprocessor.py | 3 + config/settings.py | 5 + .../utils/text_processing.py | 35 +++-- cppa_pinecone_sync/ingestion.py | 72 +++++++--- cppa_slack_tracker/preprocessor.py | 2 +- .../tests/test_text_processing.py | 135 ------------------ cppa_slack_tracker/utils/__init__.py | 9 +- 9 files changed, 97 insertions(+), 175 deletions(-) rename {cppa_slack_tracker => core}/utils/text_processing.py (85%) delete mode 100644 cppa_slack_tracker/tests/test_text_processing.py diff --git a/boost_library_docs_tracker/html_to_md.py b/boost_library_docs_tracker/html_to_md.py index 922c3fe..97862bc 100644 --- a/boost_library_docs_tracker/html_to_md.py +++ b/boost_library_docs_tracker/html_to_md.py @@ -9,7 +9,7 @@ -------- 1. _preprocess_html – remove Boost boilerplate from HTML before pandoc sees it 2. _pandoc_convert – HTML → GFM via pypandoc (CLI fallback) -3. _postprocess_markdown – strip residual HTML artefacts and rejoin split lines +3. _postprocess_markdown – strip residual HTML artefacts, rejoin split lines, then clean_text (unicode/line endings only) """ import re @@ -17,6 +17,8 @@ from bs4 import BeautifulSoup +from core.utils.text_processing import clean_text + try: import pypandoc except Exception: # optional runtime dependency @@ -299,4 +301,7 @@ def _postprocess_markdown(md: str) -> str: # 12. Collapse excessive blank lines to at most two md = _RE_EXCESS_BLANK.sub("\n\n", md) - return md.strip() + "\n" + # 13. 
Unicode / line-ending cleanup (no space collapsing — preserves markdown indent) + md = clean_text(md, remove_extra_spaces=False) + + return md.rstrip() + "\n" diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index 424f37a..b00239b 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -147,6 +147,8 @@ def _run( mode = "local-zip" if use_local else "HTTP crawl" self.stdout.write(f"Scrape mode: {mode}") + self._sync_pinecone() + for version in versions: self._process_version( version=version, diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index ea9402a..fb41553 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -21,6 +21,7 @@ from typing import Any from core.utils.boost_version_operations import encode_boost_version_string +from core.utils.text_processing import clean_text from .models import BoostDocContent from . import workspace @@ -163,6 +164,8 @@ def _build_documents( ) continue + page_content = clean_text(page_content, remove_extra_spaces=False) + library_name = _get_library_name(doc_content) metadata: dict[str, Any] = { diff --git a/config/settings.py b/config/settings.py index 9e0419a..e1eea83 100644 --- a/config/settings.py +++ b/config/settings.py @@ -177,6 +177,8 @@ PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() or "aws" # Chunking and batching PINECONE_BATCH_SIZE = env.int("PINECONE_BATCH_SIZE", default=96) +# Parallel threads for Pinecone metadata-only updates (update_documents); lower if you hit 429s. 
+PINECONE_UPDATE_MAX_WORKERS = env.int("PINECONE_UPDATE_MAX_WORKERS", default=8) PINECONE_CHUNK_SIZE = env.int("PINECONE_CHUNK_SIZE", default=1000) PINECONE_CHUNK_OVERLAP = env.int("PINECONE_CHUNK_OVERLAP", default=200) PINECONE_MIN_TEXT_LENGTH = env.int("PINECONE_MIN_TEXT_LENGTH", default=50) @@ -538,6 +540,9 @@ def _slack_team_scope_from_env(): ).strip() PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() PINECONE_BATCH_SIZE = int(env("PINECONE_BATCH_SIZE", default="96") or "96") +PINECONE_UPDATE_MAX_WORKERS = int( + env("PINECONE_UPDATE_MAX_WORKERS", default="4") or "4" +) PINECONE_CHUNK_SIZE = int(env("PINECONE_CHUNK_SIZE", default="1000") or "1000") PINECONE_CHUNK_OVERLAP = int(env("PINECONE_CHUNK_OVERLAP", default="200") or "200") PINECONE_MIN_TEXT_LENGTH = int(env("PINECONE_MIN_TEXT_LENGTH", default="50") or "50") diff --git a/cppa_slack_tracker/utils/text_processing.py b/core/utils/text_processing.py similarity index 85% rename from cppa_slack_tracker/utils/text_processing.py rename to core/utils/text_processing.py index 1dd9f25..e801d39 100644 --- a/cppa_slack_tracker/utils/text_processing.py +++ b/core/utils/text_processing.py @@ -1,16 +1,16 @@ """ -Text processing utilities for Slack message preprocessing. +Shared text cleaning and light filtering helpers. -Adapted from workspace/utility.py for Django integration. -Contains functions for cleaning, filtering, and validating Slack message content. +Used by ``cppa_slack_tracker`` (and other apps) for normalizing message text and +optional greeting/noise phrase removal. Default word lists are Slack-oriented +(``SLACK_*`` constants). 
""" -import re -import logging -from typing import Optional, Iterable, FrozenSet - -logger = logging.getLogger(__name__) +from __future__ import annotations +import html +import re +from typing import Iterable, FrozenSet, Optional # Default greeting/unessential words for filter_sentence (Slack message cleaning) SLACK_GREETING_WORDS: FrozenSet[str] = frozenset( @@ -89,12 +89,13 @@ ) -def clean_text(text: str, remove_extra_spaces: bool = True) -> str: +def clean_text(text: str | None, remove_extra_spaces: bool = True) -> str: """ Clean and normalize text content. - Removes invisible characters, normalizes line breaks, and optionally - removes extra whitespace. + Removes invisible characters, decodes HTML character references (e.g. + ``&``, ``'``, ``/``), fixes a few common bare entities without + ``;``, normalizes line breaks, and optionally removes extra whitespace. Args: text: Input text to clean @@ -118,18 +119,22 @@ def clean_text(text: str, remove_extra_spaces: bool = True) -> str: .replace("\u200b", "") .replace("\u200c", "") .replace("\u200d", "") + .replace("\xa0", " ") + .replace("\u2002", " ") + .replace("\u2003", " ") + .replace("\u2026", "...") + .replace("\u202f", " ") ) + text = html.unescape(text) + # Normalize line breaks text = re.sub(r"\r\n", "\n", text) # Windows line breaks text = re.sub(r"\r", "\n", text) # Old Mac line breaks if remove_extra_spaces: - # Remove multiple spaces text = re.sub(r" +", " ", text) - # Remove multiple newlines (keep max 2) text = re.sub(r"\n{3,}", "\n\n", text) - # Remove spaces at start/end of lines text = "\n".join(line.strip() for line in text.split("\n")) return text.strip() @@ -191,7 +196,7 @@ def filter_sentence( return sentence_lower.strip() -def validate_content_length(content: str, min_length: int = 50) -> bool: +def validate_content_length(content: str | None, min_length: int = 50) -> bool: """ Validate that content meets minimum length requirement. 
diff --git a/cppa_pinecone_sync/ingestion.py b/cppa_pinecone_sync/ingestion.py index d1ee068..6e13b8c 100644 --- a/cppa_pinecone_sync/ingestion.py +++ b/cppa_pinecone_sync/ingestion.py @@ -14,6 +14,7 @@ import hashlib import logging import re +from concurrent.futures import ThreadPoolExecutor, as_completed from enum import Enum from typing import Any, Optional @@ -73,16 +74,22 @@ def __init__(self, instance: PineconeInstance = PineconeInstance.PUBLIC) -> None self.sparse_model: str = getattr( settings, "PINECONE_SPARSE_MODEL", "pinecone-sparse-english-v0" ) + # Parallel metadata updates (update_documents); 1 = sequential. Cap with Pinecone rate limits. + self.update_max_workers: int = max( + 1, int(getattr(settings, "PINECONE_UPDATE_MAX_WORKERS", 8)) + ) self._setup_client() self._initialize_text_splitter() self._setup_indexes() logger.info( - "PineconeIngestion: dense_model=%s, sparse_model=%s, instance=%s", + "PineconeIngestion: dense_model=%s, sparse_model=%s, instance=%s, " + "update_max_workers=%d", self.dense_model, self.sparse_model, self.instance.value, + self.update_max_workers, ) @property @@ -479,24 +486,51 @@ def _update_all_batches( continue batch_failed_count = 0 - for update in batch_updates: - try: - self._update_single_record(update, namespace) - updated_count += 1 - except Exception as e: - error_msg = ( - f"Error updating metadata for batch {batch_num} " - f"record {update['id']}: {e}" - ) - logger.error(error_msg) - errors.append(error_msg) - failed_docs.append( - { - "ids": update.get("ids", ""), - "reason": f"Metadata update failed: {e}", - } - ) - batch_failed_count += 1 + if self.update_max_workers <= 1: + for update in batch_updates: + try: + self._update_single_record(update, namespace) + updated_count += 1 + except Exception as e: + error_msg = ( + f"Error updating metadata for batch {batch_num} " + f"record {update['id']}: {e}" + ) + logger.error(error_msg) + errors.append(error_msg) + failed_docs.append( + { + "ids": update.get("ids", 
""), + "reason": f"Metadata update failed: {e}", + } + ) + batch_failed_count += 1 + else: + max_workers = min(self.update_max_workers, len(batch_updates)) + with ThreadPoolExecutor(max_workers=max_workers) as pool: + future_to_update = { + pool.submit(self._update_single_record, u, namespace): u + for u in batch_updates + } + for fut in as_completed(future_to_update): + update = future_to_update[fut] + try: + fut.result() + updated_count += 1 + except Exception as e: + error_msg = ( + f"Error updating metadata for batch {batch_num} " + f"record {update['id']}: {e}" + ) + logger.error(error_msg) + errors.append(error_msg) + failed_docs.append( + { + "ids": update.get("ids", ""), + "reason": f"Metadata update failed: {e}", + } + ) + batch_failed_count += 1 logger.info( "Updated metadata for batch %d: %d/%d documents", diff --git a/cppa_slack_tracker/preprocessor.py b/cppa_slack_tracker/preprocessor.py index b2f0e05..b22c44e 100644 --- a/cppa_slack_tracker/preprocessor.py +++ b/cppa_slack_tracker/preprocessor.py @@ -22,7 +22,7 @@ from django.db.models import Q from cppa_slack_tracker.models import SlackMessage -from cppa_slack_tracker.utils.text_processing import ( +from cppa_slack_tracker.utils import ( clean_text, filter_sentence, validate_content_length, diff --git a/cppa_slack_tracker/tests/test_text_processing.py b/cppa_slack_tracker/tests/test_text_processing.py deleted file mode 100644 index d0cd44f..0000000 --- a/cppa_slack_tracker/tests/test_text_processing.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Tests for cppa_slack_tracker.utils.text_processing.""" - -from cppa_slack_tracker.utils.text_processing import ( - clean_text, - filter_sentence, - validate_content_length, - SLACK_GREETING_WORDS, - SLACK_UNESSENTIAL_WORDS, -) - - -def test_clean_text_removes_invisible_characters(): - """clean_text removes soft hyphens and zero-width spaces.""" - text = "Hello\xadworld\u200b" - result = clean_text(text) - assert result == "Helloworld" - - -def 
test_clean_text_normalizes_line_breaks(): - """clean_text normalizes different line break styles.""" - text = "Line1\r\nLine2\rLine3\nLine4" - result = clean_text(text) - assert "\r" not in result - assert result.count("\n") == 3 - - -def test_clean_text_removes_extra_spaces(): - """clean_text removes multiple spaces when remove_extra_spaces=True.""" - text = "Hello world test" - result = clean_text(text, remove_extra_spaces=True) - assert result == "Hello world test" - - -def test_clean_text_limits_newlines(): - """clean_text limits consecutive newlines to max 2.""" - text = "Line1\n\n\n\n\nLine2" - result = clean_text(text, remove_extra_spaces=True) - assert result == "Line1\n\nLine2" - - -def test_clean_text_strips_line_whitespace(): - """clean_text removes spaces at start/end of lines.""" - text = " Line1 \n Line2 " - result = clean_text(text, remove_extra_spaces=True) - assert result == "Line1\nLine2" - - -def test_clean_text_handles_empty_input(): - """clean_text returns empty string for empty input.""" - assert clean_text("") == "" - assert clean_text(None) == "" - - -def test_filter_sentence_removes_greetings(): - """filter_sentence removes greeting words as whole phrases (keeps 'hi' inside 'this').""" - sentence = "Hi there, can you help me with this?" 
- result = filter_sentence(sentence) - assert result.startswith("there") # standalone "Hi" removed - assert "help" in result - assert "this" in result # "hi" inside "this" is not removed - - -def test_filter_sentence_removes_unessential_words(): - """filter_sentence removes unessential words like 'ok', 'lol'.""" - sentence = "Ok sure, that sounds great lol" - result = filter_sentence(sentence) - # After filtering, should have remaining meaningful content - assert isinstance(result, str) - - -def test_filter_sentence_returns_empty_for_short_result(): - """filter_sentence returns empty string if result is too short.""" - sentence = "Hi ok" # Only greeting and unessential words - result = filter_sentence(sentence, min_words_after=3) - assert result == "" - - -def test_filter_sentence_handles_empty_input(): - """filter_sentence returns empty string for empty input.""" - assert filter_sentence("") == "" - assert filter_sentence(" ") == "" - - -def test_filter_sentence_custom_word_lists(): - """filter_sentence accepts custom greeting and unessential word lists.""" - sentence = "Hello world test example" - result = filter_sentence( - sentence, - greeting_words=["hello"], - unessential_words=["world"], - min_words_after=1, - ) - assert "test" in result or "example" in result - assert "hello" not in result.lower() - assert "world" not in result.lower() - - -def test_validate_content_length_accepts_long_text(): - """validate_content_length returns True for text meeting minimum length.""" - long_text = "This is a much longer text that definitely exceeds the minimum length requirement" - assert validate_content_length(long_text, min_length=50) is True - - -def test_validate_content_length_rejects_short_text(): - """validate_content_length returns False for text below minimum length.""" - short_text = "Hi" - assert validate_content_length(short_text, min_length=50) is False - - -def test_validate_content_length_handles_empty_input(): - """validate_content_length returns False 
for empty input.""" - assert validate_content_length("") is False - assert validate_content_length(None) is False - - -def test_validate_content_length_strips_whitespace(): - """validate_content_length strips whitespace before checking length.""" - text_with_spaces = " Short " - assert validate_content_length(text_with_spaces, min_length=10) is False - - -def test_slack_greeting_words_constant(): - """SLACK_GREETING_WORDS contains expected greeting words.""" - assert "hi" in SLACK_GREETING_WORDS - assert "hello" in SLACK_GREETING_WORDS - assert "thanks" in SLACK_GREETING_WORDS - assert "goodbye" in SLACK_GREETING_WORDS - - -def test_slack_unessential_words_constant(): - """SLACK_UNESSENTIAL_WORDS contains expected unessential words.""" - assert "ok" in SLACK_UNESSENTIAL_WORDS - assert "lol" in SLACK_UNESSENTIAL_WORDS - assert "yeah" in SLACK_UNESSENTIAL_WORDS - assert "awesome" in SLACK_UNESSENTIAL_WORDS diff --git a/cppa_slack_tracker/utils/__init__.py b/cppa_slack_tracker/utils/__init__.py index fdec86f..808ff04 100644 --- a/cppa_slack_tracker/utils/__init__.py +++ b/cppa_slack_tracker/utils/__init__.py @@ -1,13 +1,16 @@ """ Utility functions for cppa_slack_tracker. + +Text processing lives in ``core.utils.text_processing``; re-exported here for +stable import paths (``from cppa_slack_tracker.utils import clean_text``, etc.). 
""" -from .text_processing import ( +from core.utils.text_processing import ( + SLACK_GREETING_WORDS, + SLACK_UNESSENTIAL_WORDS, clean_text, filter_sentence, validate_content_length, - SLACK_GREETING_WORDS, - SLACK_UNESSENTIAL_WORDS, ) __all__ = [ From 6d5cc428ea8fe40f45e93fafdb9aeb446104e7f3 Mon Sep 17 00:00:00 2001 From: zho Date: Fri, 27 Mar 2026 15:38:51 +0800 Subject: [PATCH 41/76] #126-update max concurrent number --- config/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/settings.py b/config/settings.py index e1eea83..6c9726d 100644 --- a/config/settings.py +++ b/config/settings.py @@ -541,7 +541,7 @@ def _slack_team_scope_from_env(): PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() PINECONE_BATCH_SIZE = int(env("PINECONE_BATCH_SIZE", default="96") or "96") PINECONE_UPDATE_MAX_WORKERS = int( - env("PINECONE_UPDATE_MAX_WORKERS", default="4") or "4" + env("PINECONE_UPDATE_MAX_WORKERS", default="8") or "8" ) PINECONE_CHUNK_SIZE = int(env("PINECONE_CHUNK_SIZE", default="1000") or "1000") PINECONE_CHUNK_OVERLAP = int(env("PINECONE_CHUNK_OVERLAP", default="200") or "200") From cd06037215e493fb30cb88ecbcbaf1f12a9421dd Mon Sep 17 00:00:00 2001 From: zho Date: Fri, 27 Mar 2026 19:25:41 +0800 Subject: [PATCH 42/76] #126-removed the practice code --- .../management/commands/run_boost_library_docs_tracker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index b00239b..424f37a 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -147,8 +147,6 @@ def _run( mode = "local-zip" if use_local else "HTTP crawl" self.stdout.write(f"Scrape mode: {mode}") - self._sync_pinecone() - for version in versions: 
self._process_version( version=version, From 4eb5c9b73bea24bd68ecadb1fdc7a5c9b68e0439 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 27 Mar 2026 09:55:21 -0700 Subject: [PATCH 43/76] wg21 paper updates, WG21 profile test fix, revert separate test DB URL #24 --- .github/workflows/actions.yml | 1 - config/test_settings.py | 10 +++----- cppa_user_tracker/services.py | 32 +++++++----------------- cppa_user_tracker/tests/test_services.py | 17 ++++++++++--- wg21_paper_tracker/pipeline.py | 12 ++++++--- wg21_paper_tracker/services.py | 8 +++++- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index d3f1723..a811078 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -89,7 +89,6 @@ jobs: - name: Test with pytest env: DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres - TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres SECRET_KEY: for-testing-only DJANGO_SETTINGS_MODULE: config.test_settings run: | diff --git a/config/test_settings.py b/config/test_settings.py index 38aca75..b724fd9 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -7,14 +7,10 @@ from pathlib import Path from .settings import * # noqa: F401, F403 -from .settings import env -# Use SQLite in-memory for tests by default so no PostgreSQL is required. -# Set TEST_DATABASE_URL to run tests against PostgreSQL (e.g. in CI). -_test_db_url = os.environ.get("TEST_DATABASE_URL", "").strip() -if _test_db_url: - DATABASES = {"default": env.db("TEST_DATABASE_URL")} -else: +# Use SQLite in-memory for speed when DATABASE_URL not set (e.g. local pytest). 
+# CI can set DATABASE_URL=sqlite:///test.sqlite3 or leave unset for :memory: +if not os.environ.get("DATABASE_URL", "").strip(): DATABASES = { "default": { "ENGINE": "django.db.backends.sqlite3", diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 34da007..d35d23e 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -374,28 +374,14 @@ def get_or_create_wg21_paper_author_profile( ) ) - if not candidates: - profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) - if email_val: - add_email(profile, email_val, is_primary=True) - return profile, True - - if len(candidates) == 1: - profile = candidates[0] - if email_val and not profile.emails.filter(email=email_val).exists(): - add_email( - profile, - email_val, - is_primary=not profile.emails.filter(is_active=True).exists(), - ) - return profile, False - - # Two or more: disambiguate by email if provided + # Disambiguate by email if provided. + for p in candidates: + if email_val and p.emails.filter(email=email_val).exists(): + return p, False + elif not email_val and not p.emails.exists(): + return p, False + + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) if email_val: - for p in candidates: - if p.emails.filter(email=email_val).exists(): - return p, False - profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) add_email(profile, email_val, is_primary=True) - return profile, True - return candidates[0], False + return profile, True diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index 75775ed..0c09e0c 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -609,16 +609,25 @@ def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): @pytest.mark.django_db -def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_adds_email(): - """Existing single 
match gets the supplied email attached.""" +def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_creates_new_profile(): + """One name match but email not on that profile: creates a new profile with the email. + + Disambiguation only returns an existing row when the email matches or when no email + is passed and the candidate has no emails; otherwise a new profile is created. + """ existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") profile, created = services.get_or_create_wg21_paper_author_profile( display_name="Solo Author", email="solo@example.com", ) - assert created is False - assert profile.id == existing.id + assert created is True + assert profile.id != existing.id + assert profile.display_name == "Solo Author" assert profile.emails.filter(email="solo@example.com").exists() + assert ( + WG21PaperAuthorProfile.objects.filter(display_name="Solo Author").count() == 2 + ) + assert not existing.emails.filter(email="solo@example.com").exists() @pytest.mark.django_db diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index ff15f50..a6bf41b 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -136,7 +136,9 @@ def _valid_paper_entries_for_id( return valid -def _choose_best_format_entry(valid_list: list[dict[str, Any]]) -> dict[str, Any]: +def _choose_best_format_entry( + valid_list: list[dict[str, Any]], +) -> dict[str, Any]: """Pick one row by format priority (adoc first). 
Precondition: valid_list non-empty.""" return min( valid_list, @@ -344,10 +346,12 @@ def run_tracker_pipeline( to_mailing_date=to_mailing_date, ) } + new_mailing_dates = set(m["mailing_date"] for m in new_mailings) for current_m in all_mailings: - if current_m["mailing_date"] in retry_dates and current_m[ - "mailing_date" - ] not in [x["mailing_date"] for x in new_mailings]: + if ( + current_m["mailing_date"] in retry_dates + and current_m["mailing_date"] not in new_mailing_dates + ): new_mailings.append(current_m) # Sort chronologically (oldest to newest) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 24c7ba6..983493a 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -4,6 +4,7 @@ from __future__ import annotations +from datetime import date from typing import TYPE_CHECKING, Optional from django.db import IntegrityError, transaction @@ -40,7 +41,7 @@ def get_or_create_paper( paper_id: str, url: str, title: str, - document_date: Optional[str], + document_date: date | None, mailing: WG21Mailing, subgroup: str = "", author_names: Optional[list[str]] = None, @@ -139,6 +140,9 @@ def _update_paper(paper: WG21Paper) -> bool: created = False if author_names: + if not created: + for author in paper.authors.all(): + author.delete() emails = author_emails or [] for i, name in enumerate(author_names): email = emails[i] if i < len(emails) else None @@ -156,6 +160,8 @@ def get_or_create_paper_author( """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based). Updates author_order on existing link if it differs. 
""" + if not isinstance(author_order, int) or author_order <= 0: + raise ValueError("author_order must be a positive integer") link, link_created = WG21PaperAuthor.objects.get_or_create( paper=paper, profile=profile, From 09b623a74413f08950a993c0644ec3bb21f20222 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 28 Mar 2026 01:40:53 +0800 Subject: [PATCH 44/76] #126-updated all preprocessors to contain "source_ids" key in metadata --- boost_library_docs_tracker/preprocessor.py | 2 +- .../tests/test_preprocessor.py | 6 ++--- boost_mailing_list_tracker/preprocesser.py | 3 +-- .../tests/test_preprocesser.py | 5 ++-- cppa_pinecone_sync/ingestion.py | 10 +++---- cppa_pinecone_sync/sync.py | 10 +++++-- cppa_pinecone_sync/tests/test_sync.py | 27 +++++++++++++++++++ cppa_slack_tracker/preprocessor.py | 3 +-- cppa_slack_tracker/tests/test_preprocessor.py | 6 ++--- docs/Pinecone_preprocess_guideline.md | 18 ++++++------- .../preprocessors/github_preprocess.py | 17 +++++++----- 11 files changed, 72 insertions(+), 35 deletions(-) diff --git a/boost_library_docs_tracker/preprocessor.py b/boost_library_docs_tracker/preprocessor.py index fb41553..6389f10 100644 --- a/boost_library_docs_tracker/preprocessor.py +++ b/boost_library_docs_tracker/preprocessor.py @@ -172,7 +172,7 @@ def _build_documents( "doc_id": doc_content.content_hash, "url": doc_content.url, "library_name": library_name, - "ids": str(doc_content.pk), + "source_ids": str(doc_content.pk), } fk = encode_boost_version_string(first_version_str) if fk is not None: diff --git a/boost_library_docs_tracker/tests/test_preprocessor.py b/boost_library_docs_tracker/tests/test_preprocessor.py index 5137c4e..18e013f 100644 --- a/boost_library_docs_tracker/tests/test_preprocessor.py +++ b/boost_library_docs_tracker/tests/test_preprocessor.py @@ -80,7 +80,7 @@ def test_preprocess_metas_when_upserted_and_scraped_after_final_sync( assert chunked is False assert len(metas) == 1 assert metas[0]["metadata"]["doc_id"] == 
boost_doc_content.content_hash - assert metas[0]["metadata"]["ids"] == str(boost_doc_content.pk) + assert metas[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) @pytest.mark.django_db @@ -94,7 +94,7 @@ def test_preprocess_no_metas_when_final_sync_at_none(_mock_load, boost_doc_conte assert chunked is False assert metas == [] assert len(docs) == 1 - assert docs[0]["metadata"]["ids"] == str(boost_doc_content.pk) + assert docs[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) @pytest.mark.django_db @@ -136,4 +136,4 @@ def test_preprocess_meta_excludes_failed_ids(_mock_load, boost_doc_content): ) assert metas == [] assert len(docs) == 1 - assert docs[0]["metadata"]["ids"] == str(boost_doc_content.pk) + assert docs[0]["metadata"]["source_ids"] == str(boost_doc_content.pk) diff --git a/boost_mailing_list_tracker/preprocesser.py b/boost_mailing_list_tracker/preprocesser.py index 325ee9f..6f7cbbf 100644 --- a/boost_mailing_list_tracker/preprocesser.py +++ b/boost_mailing_list_tracker/preprocesser.py @@ -120,8 +120,7 @@ def preprocess_mailing_list_for_pinecone( "author": sender_name, "timestamp": safe_timestamp, "parent_id": message.parent_id or "", - # ids should reference DB row identity for sync bookkeeping. 
- "table_ids": message.pk, + "source_ids": str(message.pk), "list_name": message.list_name or "", } diff --git a/boost_mailing_list_tracker/tests/test_preprocesser.py b/boost_mailing_list_tracker/tests/test_preprocesser.py index fb9c11f..61b4655 100644 --- a/boost_mailing_list_tracker/tests/test_preprocesser.py +++ b/boost_mailing_list_tracker/tests/test_preprocesser.py @@ -120,7 +120,7 @@ def test_preprocesser_retries_failed_ids_even_if_old( ) assert len(docs) == 1 assert docs[0]["metadata"]["doc_id"] == "" - assert docs[0]["metadata"]["table_ids"] == retry_msg.pk + assert docs[0]["metadata"]["source_ids"] == str(retry_msg.pk) @pytest.mark.django_db @@ -180,7 +180,7 @@ def test_preprocesser_document_shape_and_metadata_fields( assert target["content"] != "" assert "metadata" in target assert target["metadata"]["doc_id"] == "" - assert target["metadata"]["table_ids"] == msg.pk + assert target["metadata"]["source_ids"] == str(msg.pk) assert target["metadata"]["type"] == "mailing" assert target["metadata"]["thread_id"] == "thread-1" assert target["metadata"]["parent_id"] == "" @@ -189,6 +189,7 @@ def test_preprocesser_document_shape_and_metadata_fields( assert target["metadata"]["list_name"] == default_list_name assert target["metadata"]["timestamp"] == int(sample_sent_at.timestamp()) assert "ids" not in target["metadata"] + assert "source_ids" in target["metadata"] assert "msg_id" not in target["metadata"] assert "source" not in target["metadata"] assert "sender_id" not in target["metadata"] diff --git a/cppa_pinecone_sync/ingestion.py b/cppa_pinecone_sync/ingestion.py index 6e13b8c..27b897d 100644 --- a/cppa_pinecone_sync/ingestion.py +++ b/cppa_pinecone_sync/ingestion.py @@ -382,7 +382,7 @@ def _prepare_batch_records( ) record: dict[str, Any] = {"id": doc_id, "chunk_text": text} record.update(metadata) - record.pop("table_ids", None) + record.pop("source_ids", None) records.append(record) return records @@ -413,7 +413,7 @@ def _mark_batch_failed( meta = 
doc.metadata or {} failed.append( { - "ids": meta.get("table_ids", ""), + "ids": meta.get("source_ids") or meta.get("table_ids", ""), "reason": f"Batch upsert failed: {error}", } ) @@ -561,9 +561,9 @@ def _prepare_batch_updates( record_idx=len(updates), ) - source_ids = metadata.get("table_ids", "") - metadata.pop("table_ids", None) - updates.append({"id": doc_id, "set_metadata": metadata, "ids": source_ids}) + track_ids = metadata.get("source_ids") or metadata.get("table_ids", "") + metadata.pop("source_ids", None) + updates.append({"id": doc_id, "set_metadata": metadata, "ids": track_ids}) return updates diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 5afc585..3027592 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -76,11 +76,17 @@ def _build_documents_from_raw(raw_documents: list[dict[str, Any]]) -> list[Any]: for item in raw_documents: content = item.get("content", "") metadata = dict(item.get("metadata") or {}) - ids_str = metadata.get("ids") or item.get("ids", "") or "" + ids_str = ( + metadata.get("source_ids") + or metadata.get("ids") + or item.get("source_ids", "") + or item.get("ids", "") + or "" + ) if "doc_id" not in metadata and "url" not in metadata: logger.warning( - "Skipping document with ids=%s: metadata must contain 'doc_id' or 'url'", + "Skipping document with source_ids=%s: metadata must contain 'doc_id' or 'url'", ids_str, ) continue diff --git a/cppa_pinecone_sync/tests/test_sync.py b/cppa_pinecone_sync/tests/test_sync.py index 8f1dbed..beac071 100644 --- a/cppa_pinecone_sync/tests/test_sync.py +++ b/cppa_pinecone_sync/tests/test_sync.py @@ -97,6 +97,33 @@ def test_build_documents_from_raw_mixed(): assert result[1].page_content == "c" +def test_build_documents_from_raw_metadata_source_ids(): + """metadata['source_ids'] is copied to table_ids (preferred over legacy top-level ids).""" + raw = [ + { + "content": "hello", + "metadata": {"doc_id": "doc-1", "source_ids": "42"}, + }, + ] + 
result = _build_documents_from_raw(raw) + assert len(result) == 1 + assert result[0].metadata.get("table_ids") == "42" + + +def test_build_documents_from_raw_source_ids_overrides_top_level_ids(): + """When both are present, metadata['source_ids'] wins for table_ids.""" + raw = [ + { + "ids": "legacy", + "content": "x", + "metadata": {"doc_id": "d", "source_ids": "from-meta"}, + }, + ] + result = _build_documents_from_raw(raw) + assert len(result) == 1 + assert result[0].metadata.get("table_ids") == "from-meta" + + # --- _extract_new_failed_ids --- diff --git a/cppa_slack_tracker/preprocessor.py b/cppa_slack_tracker/preprocessor.py index b22c44e..ce5df6c 100644 --- a/cppa_slack_tracker/preprocessor.py +++ b/cppa_slack_tracker/preprocessor.py @@ -382,8 +382,7 @@ def preprocess_slack_for_pinecone( "thread_ts": thread_ts if thread_ts else "", "group_size": len(message_ids), "team_id": team_id, - # ids should reference message timestamps for sync bookkeeping - "ids": ",".join(message_ids), + "source_ids": ",".join(message_ids), } docs.append({"content": content, "metadata": metadata}) diff --git a/cppa_slack_tracker/tests/test_preprocessor.py b/cppa_slack_tracker/tests/test_preprocessor.py index 5a20f6a..c44dcad 100644 --- a/cppa_slack_tracker/tests/test_preprocessor.py +++ b/cppa_slack_tracker/tests/test_preprocessor.py @@ -182,9 +182,9 @@ def test_preprocessor_document_shape_and_metadata_fields( assert "timestamp" in target["metadata"] assert "team_id" in target["metadata"] - # Check ids field for retry tracking - assert "ids" in target["metadata"] - assert isinstance(target["metadata"]["ids"], str) + # Check source_ids for retry tracking (Pinecone_preprocess_guideline.md) + assert "source_ids" in target["metadata"] + assert isinstance(target["metadata"]["source_ids"], str) @pytest.mark.django_db diff --git a/docs/Pinecone_preprocess_guideline.md b/docs/Pinecone_preprocess_guideline.md index ac04902..c04a2f8 100644 --- a/docs/Pinecone_preprocess_guideline.md +++ 
b/docs/Pinecone_preprocess_guideline.md @@ -71,7 +71,7 @@ Each item in the list must be a dict with at least: | `content` | top-level | Yes | The text to index (plain string). | | `metadata` | top-level | Yes | Dict of metadata attached to the document. | | `metadata["doc_id"]` or `metadata["url"]` | inside `metadata` | One required | Stable identifier for the document (e.g. primary key, URL). Used for chunk IDs and for skipping invalid docs. | -| `metadata["ids"]` | inside `metadata` | Recommended | Comma-separated **source record IDs** (e.g. DB primary keys). Used to record failed IDs when an upsert fails so they can be retried next run. If omitted, failed-document tracking for that item will be empty. | +| `metadata["source_ids"]` | inside `metadata` | Recommended | Comma-separated **source record IDs** (e.g. DB primary keys). Used to record failed IDs when an upsert fails so they can be retried next run. If omitted, failed-document tracking for that item will be empty. | Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed through to Pinecone and can be used for filtering or display. @@ -82,7 +82,7 @@ Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed throu "content": "The actual text to index for this document or chunk.", "metadata": { "doc_id": "slack-msg-12345", # or "url": "https://..." - "ids": "12345", # source ID(s) for retry tracking + "source_ids": "12345", # source ID(s) for retry tracking "title": "Optional title", }, } @@ -90,12 +90,12 @@ Any other keys in `metadata` (e.g. `title`, `author`, `source`) are passed throu ### Example with multiple source IDs (e.g. 
one chunk from multiple rows) -If one logical “document” is built from several source records, pass their IDs in `metadata["ids"]` as a comma-separated string so that if the upsert fails, all of them are recorded for retry: +If one logical “document” is built from several source records, pass their IDs in `metadata["source_ids"]` as a comma-separated string so that if the upsert fails, all of them are recorded for retry: ```python "metadata": { "doc_id": "thread-abc", - "ids": "101,102,103", + "source_ids": "101,102,103", } ``` @@ -145,10 +145,10 @@ python manage.py run_cppa_pinecone_sync \ --pinecone-instance private ``` -| Instance | Django setting read | `.env` key | -|------------|-----------------------------|-----------------------------| -| `public` | `PINECONE_API_KEY` | `PINECONE_API_KEY` | -| `private` | `PINECONE_PRIVATE_API_KEY` | `PINECONE_PRIVATE_API_KEY` | +| Instance | Django setting read | `.env` key | +| --------- | -------------------------- | -------------------------- | +| `public` | `PINECONE_API_KEY` | `PINECONE_API_KEY` | +| `private` | `PINECONE_PRIVATE_API_KEY` | `PINECONE_PRIVATE_API_KEY` | If no `instance` is specified, **public** is used. @@ -159,7 +159,7 @@ If no `instance` is specified, **public** is used. - [ ] Signature: `(failed_ids: list[str], final_sync_at: datetime | None) -> tuple[list[dict], bool]`. - [ ] Each dict has top-level `content` (str) and `metadata` (dict). - [ ] Each `metadata` has at least one of `doc_id` or `url`. -- [ ] For retry tracking, set `metadata["ids"]` to the source record ID(s), comma-separated if multiple. +- [ ] For retry tracking, set `metadata["source_ids"]` to the source record ID(s), comma-separated if multiple. - [ ] Use `failed_ids` to re-include previously failed records. - [ ] Use `final_sync_at` for incremental sync when applicable. - [ ] Return `is_chunked=True` only if you are already emitting final chunks; otherwise `False`. 
diff --git a/github_activity_tracker/preprocessors/github_preprocess.py b/github_activity_tracker/preprocessors/github_preprocess.py index 11643a6..8911f11 100644 --- a/github_activity_tracker/preprocessors/github_preprocess.py +++ b/github_activity_tracker/preprocessors/github_preprocess.py @@ -18,7 +18,7 @@ "content": , "metadata": { "doc_id": , - "ids": ":issue:" or ":pr:", + "source_ids": ":issue:" or ":pr:", "type": "issue" | "pr", "number": , "title": , @@ -39,7 +39,7 @@ import logging from datetime import datetime, timezone from pathlib import Path -from typing import Any, Generator +from typing import Any, Generator, Literal from operations.md_ops.issue_to_md import issue_json_to_md from operations.md_ops.pr_to_md import pr_json_to_md @@ -108,6 +108,11 @@ def _iter_json_files( yield path, data +def get_ids_for_pinecone(repo: str, type: Literal["issue", "pr"], number: int) -> str: + """Get the ids for Pinecone from a repo, type, and number.""" + return f"{repo}:{type}:{number}" + + # --------------------------------------------------------------------------- # Public iterators # --------------------------------------------------------------------------- @@ -171,7 +176,7 @@ def build_issue_document( "content": content, "metadata": { "doc_id": html_url, - "ids": f"{repo}:issue:{number}", + "source_ids": get_ids_for_pinecone(repo, "issue", number), "type": "issue", "number": number, "title": (info.get("title") or "").strip(), @@ -216,7 +221,7 @@ def build_pr_document( "content": content, "metadata": { "doc_id": html_url, - "ids": f"{repo}:pr:{number}", + "source_ids": get_ids_for_pinecone(repo, "pr", number), "type": "pr", "number": number, "title": (info.get("title") or "").strip(), @@ -264,7 +269,7 @@ def preprocess_issues( for path, data in iter_raw_issue_jsons(owner, repo): info = data.get("issue_info") or {} number = info.get("number") or -1 - ids_val = f"{repo}:issue:{number}" + ids_val = get_ids_for_pinecone(repo, "issue", number) is_failed = ids_val in 
failed_set updated_at = _parse_updated_at(info) @@ -322,7 +327,7 @@ def preprocess_prs( for path, data in iter_raw_pr_jsons(owner, repo): info = data.get("pr_info") or {} number = info.get("number") or -1 - ids_val = f"{repo}:pr:{number}" + ids_val = get_ids_for_pinecone(repo, "pr", number) is_failed = ids_val in failed_set updated_at = _parse_updated_at(info) From 2d8a3126b70edf461d33533b78db8f83e6ec68a9 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 28 Mar 2026 01:43:25 +0800 Subject: [PATCH 45/76] #126-rename typo of boost mailing preprocessor --- boost_mailing_list_tracker/{preprocesser.py => preprocessor.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename boost_mailing_list_tracker/{preprocesser.py => preprocessor.py} (100%) diff --git a/boost_mailing_list_tracker/preprocesser.py b/boost_mailing_list_tracker/preprocessor.py similarity index 100% rename from boost_mailing_list_tracker/preprocesser.py rename to boost_mailing_list_tracker/preprocessor.py From 026b22bb8bf477e78f8516144fdfb5abf0a99cfd Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 28 Mar 2026 02:03:13 +0800 Subject: [PATCH 46/76] #106-fixed test for renamed file --- .../management/commands/run_boost_mailing_list_tracker.py | 2 +- boost_mailing_list_tracker/tests/test_preprocesser.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py b/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py index a732eb3..5784081 100644 --- a/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py +++ b/boost_mailing_list_tracker/management/commands/run_boost_mailing_list_tracker.py @@ -61,7 +61,7 @@ def _run_pinecone_sync(app_type: str, namespace: str) -> None: "run_cppa_pinecone_sync", app_type=app_type, namespace=namespace, - preprocessor="boost_mailing_list_tracker.preprocesser.preprocess_mailing_list_for_pinecone", + 
preprocessor="boost_mailing_list_tracker.preprocessor.preprocess_mailing_list_for_pinecone", ) logger.info( "run_boost_mailing_list_tracker: pinecone sync completed (app_type=%s, namespace=%s)", diff --git a/boost_mailing_list_tracker/tests/test_preprocesser.py b/boost_mailing_list_tracker/tests/test_preprocesser.py index 61b4655..fff8803 100644 --- a/boost_mailing_list_tracker/tests/test_preprocesser.py +++ b/boost_mailing_list_tracker/tests/test_preprocesser.py @@ -1,11 +1,11 @@ -"""Tests for boost_mailing_list_tracker.preprocesser.""" +"""Tests for boost_mailing_list_tracker.preprocessor.""" from datetime import timedelta import pytest from django.utils import timezone -from boost_mailing_list_tracker.preprocesser import ( +from boost_mailing_list_tracker.preprocessor import ( preprocess_mailing_list_for_pinecone, ) From c993d3c19108465fdabaa6208fbe9f13c492f417 Mon Sep 17 00:00:00 2001 From: zho Date: Sat, 28 Mar 2026 03:17:40 +0800 Subject: [PATCH 47/76] #126-addressed minor errors --- .../commands/run_boost_library_docs_tracker.py | 1 - cppa_pinecone_sync/sync.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py index 424f37a..2221aef 100644 --- a/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py +++ b/boost_library_docs_tracker/management/commands/run_boost_library_docs_tracker.py @@ -394,7 +394,6 @@ def _sync_pinecone(self): return successful_ids = result.get("successful_source_ids", []) - failed_ids = result.get("failed_ids", []) int_successful_ids: list[int] = [] for sid in successful_ids: try: diff --git a/cppa_pinecone_sync/sync.py b/cppa_pinecone_sync/sync.py index 3027592..7bbffb7 100644 --- a/cppa_pinecone_sync/sync.py +++ b/cppa_pinecone_sync/sync.py @@ -9,7 +9,8 @@ 3. Upsert documents to Pinecone via PineconeIngestion. 4. 
Update the fail list and sync status in the database. -See docs/pinecone_sync.md for the full specification. +See docs/Pinecone_preprocess_guideline.md (preprocess contract) and +docs/service_api/cppa_pinecone_sync.md (fail list / sync status services). """ from __future__ import annotations @@ -237,11 +238,11 @@ def sync_to_pinecone( services.clear_failed_ids(app_type) if new_failed_ids: services.record_failed_ids(app_type, new_failed_ids) - logger.warning( - "app_type=%s: %d source IDs recorded as failed", - app_type, - len(new_failed_ids), - ) + logger.warning( + "app_type=%s: %d source IDs recorded as failed", + app_type, + len(new_failed_ids), + ) services.update_sync_status(app_type) From 9fa9a773f18e51468d9e9e381bd7847c9297ef08 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Sat, 28 Mar 2026 03:51:31 -0400 Subject: [PATCH 48/76] feat(ops): Slack/Discord startup notification after deploy health checks - #125 --- .github/workflows/deploy-script/deploy.sh | 3 + Makefile | 6 + config/settings.py | 2 + core/management/__init__.py | 0 core/management/commands/__init__.py | 0 .../commands/send_startup_notification.py | 197 ++++++++++++++++++ 6 files changed, 208 insertions(+) create mode 100644 core/management/__init__.py create mode 100644 core/management/commands/__init__.py create mode 100644 core/management/commands/send_startup_notification.py diff --git a/.github/workflows/deploy-script/deploy.sh b/.github/workflows/deploy-script/deploy.sh index 716d1c6..ea5ac39 100644 --- a/.github/workflows/deploy-script/deploy.sh +++ b/.github/workflows/deploy-script/deploy.sh @@ -60,4 +60,7 @@ until make health >/dev/null 2>&1; do done log "Stack is healthy." +log "Sending startup notification..." +DEPLOY_BRANCH="$BRANCH" make notify || log "WARNING: Startup notification failed (non-fatal)." + log "Deploy completed." 
diff --git a/Makefile b/Makefile index 317972d..5ead9d5 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ SHELL := /bin/bash COMPOSE := docker compose APP := web +BEAT := celery_beat MANAGE := $(COMPOSE) run --rm $(APP) python manage.py .DEFAULT_GOAL := help @@ -32,6 +33,7 @@ help: @echo " Logs & status" @echo " ps Show running containers" @echo " health Verify DB, Redis, Selenium, and Celery containers" + @echo " notify Send Slack/Discord startup notification (celery_beat; optional DEPLOY_BRANCH)" @echo " logs Follow logs for all services" @echo " logs-web Follow logs for the web service" @echo " logs-worker Follow logs for the Celery worker" @@ -101,6 +103,10 @@ health: $(COMPOSE) ps --status running celery_worker | grep -q celery_worker $(COMPOSE) ps --status running celery_beat | grep -q celery_beat +.PHONY: notify +notify: + $(COMPOSE) exec -T -e DEPLOY_BRANCH="$(DEPLOY_BRANCH)" $(BEAT) python manage.py send_startup_notification + .PHONY: logs logs: $(COMPOSE) logs -f diff --git a/config/settings.py b/config/settings.py index ed9c9ff..55d1ae2 100644 --- a/config/settings.py +++ b/config/settings.py @@ -454,6 +454,8 @@ def _slack_team_scope_from_env(): ENABLE_ERROR_NOTIFICATIONS = env.bool("ENABLE_ERROR_NOTIFICATIONS", default=False) DISCORD_WEBHOOK_URL = env("DISCORD_WEBHOOK_URL", default="") SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL", default="") +# Post to webhooks after deploy (see make notify / send_startup_notification) +ENABLE_STARTUP_NOTIFICATIONS = env.bool("ENABLE_STARTUP_NOTIFICATIONS", default=True) LOGGING = { "version": 1, diff --git a/core/management/__init__.py b/core/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/management/commands/__init__.py b/core/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/management/commands/send_startup_notification.py b/core/management/commands/send_startup_notification.py new file mode 100644 index 0000000..8e6bd54 --- /dev/null 
+++ b/core/management/commands/send_startup_notification.py @@ -0,0 +1,197 @@ +""" +Post deploy/startup status to Slack and Discord webhooks (DB, Celery beat schedule, workers). +Invoked after health checks via: DEPLOY_BRANCH= make notify +""" + +import json +import logging +import os +import sys +from datetime import datetime, timezone +from urllib import request +from urllib.error import URLError + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.db import connection + +from celery.schedules import crontab, schedule as celery_interval_schedule + +from config.celery import app as celery_app + +logger = logging.getLogger(__name__) + +BEAT_LINES_CAP = 25 + + +def _crontab_field_to_sorted_ints(field): + if field is None: + return None + if isinstance(field, int): + return [field] + if isinstance(field, (set, frozenset)): + return sorted(field) + if hasattr(field, "__iter__") and not isinstance(field, (str, bytes)): + try: + return sorted(int(x) for x in field) + except (TypeError, ValueError): + return None + return None + + +def _crontab_is_universal_star(field): + if field is None: + return True + s = str(field).strip() + return s in ("*", "**", "None") + + +def describe_celery_schedule(sched) -> str: + if isinstance(sched, celery_interval_schedule): + run_every = getattr(sched, "run_every", None) + if run_every is not None: + minutes = int(run_every.total_seconds() // 60) + return f"every {minutes} minutes" + return repr(sched) + if isinstance(sched, crontab): + hours = _crontab_field_to_sorted_ints(sched.hour) + minutes = _crontab_field_to_sorted_ints(sched.minute) + parts = [] + if hours is not None and minutes is not None and len(hours) == 1 and len(minutes) == 1: + parts.append(f"{hours[0]:02d}:{minutes[0]:02d} UTC") + else: + parts.append(f"crontab hour={sched.hour!r} minute={sched.minute!r}") + if not _crontab_is_universal_star(getattr(sched, "day_of_week", None)): + 
parts.append(f"dow={sched.day_of_week!r}") + if not _crontab_is_universal_star(getattr(sched, "day_of_month", None)): + parts.append(f"dom={sched.day_of_month!r}") + if not _crontab_is_universal_star(getattr(sched, "month_of_year", None)): + parts.append(f"moy={sched.month_of_year!r}") + return " ".join(parts) + return repr(sched) + + +def collect_beat_lines(beat_schedule: dict) -> tuple[list[str], int]: + lines = [] + total = len(beat_schedule) + for name in sorted(beat_schedule.keys()): + entry = beat_schedule[name] + task = entry.get("task", "?") + sch = entry.get("schedule") + cadence = describe_celery_schedule(sch) if sch is not None else "?" + lines.append(f"- `{name}` → `{task}` @ {cadence}") + return lines, total + + +def post_discord(webhook_url: str, title: str, description: str) -> None: + embed = { + "title": title, + "description": description[:4000], + "color": 0x3498DB, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + payload = {"username": "Boost Data Collector", "embeds": [embed]} + data = json.dumps(payload).encode("utf-8") + req = request.Request( + webhook_url, + data=data, + headers={"Content-Type": "application/json"}, + ) + with request.urlopen(req, timeout=15) as resp: + if resp.status not in (200, 204): + logger.warning("Discord webhook returned status %s", resp.status) + + +def post_slack(webhook_url: str, title: str, text: str) -> None: + blocks = [ + { + "type": "header", + "text": {"type": "plain_text", "text": title, "emoji": True}, + }, + {"type": "section", "text": {"type": "mrkdwn", "text": f"```{text[:2800]}```"}}, + ] + payload = {"username": "Boost Data Collector", "blocks": blocks, "icon_emoji": ":white_check_mark:"} + data = json.dumps(payload).encode("utf-8") + req = request.Request( + webhook_url, + data=data, + headers={"Content-Type": "application/json"}, + ) + with request.urlopen(req, timeout=15) as resp: + if resp.status != 200: + logger.warning("Slack webhook returned status %s", resp.status) + + +class 
Command(BaseCommand): + help = "Send startup/deploy notification to Slack and Discord webhooks." + + def handle(self, *args, **options): + if not getattr(settings, "ENABLE_STARTUP_NOTIFICATIONS", True): + logger.info("Startup notifications disabled (ENABLE_STARTUP_NOTIFICATIONS).") + return + + discord_url = (getattr(settings, "DISCORD_WEBHOOK_URL", None) or "").strip() + slack_url = (getattr(settings, "SLACK_WEBHOOK_URL", None) or "").strip() + if not discord_url and not slack_url: + logger.info("No DISCORD_WEBHOOK_URL or SLACK_WEBHOOK_URL; skipping notification.") + return + + notify_at = datetime.now(timezone.utc) + branch = os.environ.get("DEPLOY_BRANCH", "").strip() or "unknown" + + db_line = "DB: error" + try: + connection.ensure_connection() + tables = connection.introspection.table_names() + db_line = f"DB: OK, {len(tables)} tables" + except Exception as exc: + db_line = f"DB: failed ({exc})" + + beat_schedule = dict(celery_app.conf.beat_schedule or {}) + beat_lines, beat_total = collect_beat_lines(beat_schedule) + shown = beat_lines[:BEAT_LINES_CAP] + beat_block = "\n".join(shown) + if beat_total > len(shown): + beat_block += f"\n… and {beat_total - len(shown)} more" + + worker_line = "Celery workers: unknown" + try: + insp = celery_app.control.inspect(timeout=5.0) + pong = insp.ping() if insp else None + n = len(pong) if pong else 0 + worker_line = f"Celery workers: {n} (ping)" + except Exception as exc: + worker_line = f"Celery workers: inspect failed ({exc})" + + text_body = ( + f"Time (UTC): {notify_at.strftime('%Y-%m-%d %H:%M:%S')}\n" + f"Branch: {branch}\n" + f"{db_line}\n" + f"{worker_line}\n" + f"Celery beat entries: {beat_total}\n" + f"{beat_block if beat_block else '(none)'}" + ) + + title = "Boost Data Collector — stack healthy" + errors = [] + if discord_url: + try: + post_discord(discord_url, title, text_body) + except URLError as e: + errors.append(f"Discord: {e}") + except Exception as e: + errors.append(f"Discord: {e}") + if slack_url: + 
try: + post_slack(slack_url, title, text_body) + except URLError as e: + errors.append(f"Slack: {e}") + except Exception as e: + errors.append(f"Slack: {e}") + + if errors: + for err in errors: + logger.error("%s", err) + sys.exit(1) + + logger.info("Startup notification sent.") From ae865fbf7170be780b3329af81fc02ea42a9368b Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Sat, 28 Mar 2026 03:57:52 -0400 Subject: [PATCH 49/76] Fix: Lint/format error - #125 --- .../commands/send_startup_notification.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/core/management/commands/send_startup_notification.py b/core/management/commands/send_startup_notification.py index 8e6bd54..7d5253f 100644 --- a/core/management/commands/send_startup_notification.py +++ b/core/management/commands/send_startup_notification.py @@ -57,7 +57,12 @@ def describe_celery_schedule(sched) -> str: hours = _crontab_field_to_sorted_ints(sched.hour) minutes = _crontab_field_to_sorted_ints(sched.minute) parts = [] - if hours is not None and minutes is not None and len(hours) == 1 and len(minutes) == 1: + if ( + hours is not None + and minutes is not None + and len(hours) == 1 + and len(minutes) == 1 + ): parts.append(f"{hours[0]:02d}:{minutes[0]:02d} UTC") else: parts.append(f"crontab hour={sched.hour!r} minute={sched.minute!r}") @@ -110,7 +115,11 @@ def post_slack(webhook_url: str, title: str, text: str) -> None: }, {"type": "section", "text": {"type": "mrkdwn", "text": f"```{text[:2800]}```"}}, ] - payload = {"username": "Boost Data Collector", "blocks": blocks, "icon_emoji": ":white_check_mark:"} + payload = { + "username": "Boost Data Collector", + "blocks": blocks, + "icon_emoji": ":white_check_mark:", + } data = json.dumps(payload).encode("utf-8") req = request.Request( webhook_url, @@ -127,13 +136,17 @@ class Command(BaseCommand): def handle(self, *args, **options): if not getattr(settings, "ENABLE_STARTUP_NOTIFICATIONS", True): - logger.info("Startup 
notifications disabled (ENABLE_STARTUP_NOTIFICATIONS).") + logger.info( + "Startup notifications disabled (ENABLE_STARTUP_NOTIFICATIONS)." + ) return discord_url = (getattr(settings, "DISCORD_WEBHOOK_URL", None) or "").strip() slack_url = (getattr(settings, "SLACK_WEBHOOK_URL", None) or "").strip() if not discord_url and not slack_url: - logger.info("No DISCORD_WEBHOOK_URL or SLACK_WEBHOOK_URL; skipping notification.") + logger.info( + "No DISCORD_WEBHOOK_URL or SLACK_WEBHOOK_URL; skipping notification." + ) return notify_at = datetime.now(timezone.utc) From 4369f9028c905436c97523adb0a3ff8bf83a80c7 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Sat, 28 Mar 2026 04:49:33 -0400 Subject: [PATCH 50/76] Fix: workspace and logs folder in docker compose --- docker-compose.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 7b01b88..5d07308 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -58,8 +58,8 @@ services: ALLOWED_HOSTS: localhost,127.0.0.1,web,0.0.0.0 SELENIUM_HUB_URL: http://selenium:4444/wd/hub volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs depends_on: # db: { condition: service_healthy } redis: { condition: service_healthy } @@ -74,8 +74,8 @@ services: CELERY_RESULT_BACKEND: redis://redis:6379/0 SELENIUM_HUB_URL: http://selenium:4444/wd/hub volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs command: celery -A config worker -l info depends_on: # db: { condition: service_healthy } @@ -90,8 +90,8 @@ services: CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 volumes: - - workspace_data:/app/workspace - - logs_data:/app/logs + - ./workspace:/app/workspace + - ./logs:/app/logs - celerybeat_data:/app/celerybeat command: celery -A config beat -l info depends_on: @@ -101,6 +101,6 @@ services: volumes: 
postgres_data: - workspace_data: - logs_data: + # workspace_data: + # logs_data: celerybeat_data: From 1885fec657773d53a138c6f894f0c6c9ad879281 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 13:54:07 -0400 Subject: [PATCH 51/76] fix(dashboard): reliable publish and CLI cleanup - #132 --- .env.example | 13 +- boost_library_usage_dashboard/analyzer.py | 3 +- .../run_boost_library_usage_dashboard.py | 191 +++++++----------- boost_library_usage_dashboard/publisher.py | 97 +++++++++ .../tests/test_analyzer.py | 1 - .../tests/test_command.py | 84 ++++---- config/settings.py | 17 +- github_ops/git_ops.py | 15 ++ 8 files changed, 237 insertions(+), 184 deletions(-) create mode 100644 boost_library_usage_dashboard/publisher.py diff --git a/.env.example b/.env.example index 4c3c5b1..ea09ef2 100644 --- a/.env.example +++ b/.env.example @@ -170,17 +170,18 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # REPO_COUNT_LANGUAGES=C++,Python,Rust # ============================================================================= -# Boost Library Usage Dashboard (optional; for --publish) +# Boost Library Usage Dashboard # ============================================================================= -# When set, run_boost_library_usage_dashboard --publish uses a persistent clone -# at raw/boost_library_usage_dashboard// (clone if missing, pull, copy, push). +# Target repo for publishing (run_boost_library_usage_dashboard without --skip-publish). +# Clone/pull/push uses GITHUB_TOKEN_WRITE (see GitHub tokens above). 
# BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER=your-org # BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO=your-dashboard-repo -# Token for clone/pull/push (defaults to GITHUB_TOKEN_WRITE if unset) -# BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN=ghp_xxxx -# Branch to publish to # BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH=main +# Git commit author identity used when publishing (defaults shown) +# GIT_AUTHOR_NAME=unknown +# GIT_AUTHOR_EMAIL=unknown@noreply.github.com + # ============================================================================= # Workspace (optional; default: project_root/workspace) # ============================================================================= diff --git a/boost_library_usage_dashboard/analyzer.py b/boost_library_usage_dashboard/analyzer.py index d9063ed..0ffa304 100644 --- a/boost_library_usage_dashboard/analyzer.py +++ b/boost_library_usage_dashboard/analyzer.py @@ -40,8 +40,7 @@ class BoostUsageDashboardAnalyzer: - def __init__(self, base_dir: Path, output_dir: Path): - self.base_dir = base_dir + def __init__(self, output_dir: Path): self.output_dir = output_dir self.dashboard_data_file = output_dir / "dashboard_data.json" self.report_file = output_dir / "Boost_Usage_Report_total.md" diff --git a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py index b41144d..02899b5 100644 --- a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py +++ b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py @@ -1,17 +1,13 @@ import logging -import shutil -from datetime import datetime -from pathlib import Path -from zoneinfo import ZoneInfo from django.conf import settings -from django.core.management.base import BaseCommand, CommandError +from django.core.management.base import BaseCommand from boost_library_usage_dashboard.analyzer import 
BoostUsageDashboardAnalyzer +from boost_library_usage_dashboard.publisher import publish_dashboard from boost_library_usage_dashboard.renderer import render_dashboard_html from boost_library_usage_dashboard.report import write_summary_report from config.workspace import get_workspace_path -from github_ops.git_ops import clone_repo, pull, push logger = logging.getLogger(__name__) @@ -19,154 +15,103 @@ class Command(BaseCommand): help = ( "Generate Boost library usage report/dashboard from PostgreSQL data, " - "then optionally publish generated files to a target GitHub repository." + "then publish generated files to a target GitHub repository unless skipped." ) def add_arguments(self, parser): parser.add_argument( - "--publish", + "--skip-collect", action="store_true", - help="Publish generated files to the repository configured in settings.", + help="Skip PostgreSQL collection and Markdown report generation.", ) parser.add_argument( - "--target-branch", + "--skip-render", + action="store_true", + help="Skip HTML rendering.", + ) + parser.add_argument( + "--skip-publish", + action="store_true", + help="Skip publishing to the configured GitHub repository.", + ) + parser.add_argument( + "--owner", + type=str, + default="", + help="Publish repo owner (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER).", + ) + parser.add_argument( + "--repo", type=str, - default="main", - help="Branch for pushing generated dashboard files.", + default="", + help="Publish repo name (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO).", ) parser.add_argument( - "--output-dir", + "--branch", type=str, default="", - help="Custom output directory. 
Defaults to workspace/boost_library_usage_dashboard.", + help="Branch to publish to (overrides BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH; default main).", ) def handle(self, *args, **options): - output_dir = ( - Path(options["output_dir"]).resolve() - if options["output_dir"] - else get_workspace_path("boost_library_usage_dashboard") - ) + output_dir = get_workspace_path("boost_library_usage_dashboard").resolve() output_dir.mkdir(parents=True, exist_ok=True) - self.stdout.write("Step 1: Collecting dashboard data from PostgreSQL...") - analyzer = BoostUsageDashboardAnalyzer( - base_dir=settings.BASE_DIR, output_dir=output_dir - ) - stats = analyzer.run() + skip_collect = options["skip_collect"] + skip_render = options["skip_render"] + skip_publish = options["skip_publish"] - self.stdout.write("Step 2: Writing Markdown report...") - write_summary_report( - analyzer.report_file, - stats, - stars_min_threshold=analyzer.stars_min_threshold, - ) + if not skip_collect: + logger.info("Step 1: Collecting dashboard data from PostgreSQL...") + analyzer = BoostUsageDashboardAnalyzer(output_dir=output_dir) + stats = analyzer.run() - self.stdout.write("Step 3: Rendering HTML files...") - render_dashboard_html(base_dir=settings.BASE_DIR, output_dir=output_dir) + logger.info("Step 2: Writing Markdown report...") + write_summary_report( + analyzer.report_file, + stats, + stars_min_threshold=analyzer.stars_min_threshold, + ) - self.stdout.write( - self.style.SUCCESS(f"Dashboard artifacts generated at: {output_dir}") - ) + if not skip_render: + logger.info("Step 3: Rendering HTML files...") + render_dashboard_html(base_dir=settings.BASE_DIR, output_dir=output_dir) - if options["publish"]: - owner = ( + if not skip_collect or not skip_render: + logger.info("Dashboard artifacts at: %s", output_dir) + + if not skip_publish: + owner = (options["owner"] or "").strip() or ( getattr(settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "") or "" ).strip() - repo = ( + repo = 
(options["repo"] or "").strip() or ( getattr(settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", "") or "" ).strip() branch = ( - getattr( - settings, - "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", - "", + (options["branch"] or "").strip() + or ( + getattr( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", + "", + ) + or "" + ).strip() + or "main" + ) + + if not owner or not repo: + logger.warning( + "Skipping publish: set BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER " + "and BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO in settings, or pass " + "--owner and --repo." ) - or "" - ).strip() or options["target_branch"] - if owner and repo: - self._publish_via_raw_clone( + else: + publish_dashboard( output_dir=output_dir, owner=owner, repo=repo, branch=branch, ) - else: - raise CommandError( - "Cannot publish: BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER " - "and BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO must be set in settings." - ) - - def _publish_via_raw_clone( - self, - output_dir: Path, - owner: str, - repo: str, - branch: str, - ) -> None: - """ - Publish using persistent clone at raw/boost_library_usage_dashboard/owner/repo. - Clone if missing, pull, remove contents, copy output_dir, add/commit/push. - """ - clone_dir = ( - Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo - ) - clone_dir = clone_dir.resolve() - output_dir = output_dir.resolve() - if ( - clone_dir == output_dir - or clone_dir in output_dir.parents - or output_dir in clone_dir.parents - ): - raise CommandError( - "--output-dir must not overlap with the publish clone path: " - f"{clone_dir}" - ) - clone_dir.parent.mkdir(parents=True, exist_ok=True) - token = ( - getattr(settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN", None) - or None - ) - repo_slug = f"{owner}/{repo}" - self.stdout.write( - f"Publishing dashboard artifacts to {repo_slug} ({branch})..." 
- ) - if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): - if clone_dir.exists(): - shutil.rmtree(clone_dir) - self.stdout.write(f"Cloning {repo_slug} to {clone_dir}...") - clone_repo(repo_slug, clone_dir, token=token) - self.stdout.write("Pulling latest...") - pull(clone_dir, branch=branch, token=token) - for child in clone_dir.iterdir(): - if child.name == ".git": - continue - if child.is_dir() and child.name == "develop": - shutil.rmtree(child) - publish_subdir = clone_dir / "develop" - publish_subdir.mkdir(parents=True, exist_ok=True) - for child in output_dir.iterdir(): - dest = publish_subdir / child.name - if child.is_dir(): - shutil.copytree(child, dest) - else: - if child.suffix != ".html": - continue - shutil.copy2(child, dest) - tz_name = getattr(settings, "CELERY_TIMEZONE", None) or settings.TIME_ZONE - commit_time = datetime.now(ZoneInfo(tz_name)).strftime("%Y-%m-%d %H:%M:%S") - commit_message = ( - f"Update Boost library usage dashboard artifacts ({commit_time})" - ) - push( - clone_dir, - remote="origin", - branch=branch, - commit_message=commit_message, - token=token, - ) - self.stdout.write( - self.style.SUCCESS("Dashboard artifacts published successfully.") - ) diff --git a/boost_library_usage_dashboard/publisher.py b/boost_library_usage_dashboard/publisher.py new file mode 100644 index 0000000..533fb9b --- /dev/null +++ b/boost_library_usage_dashboard/publisher.py @@ -0,0 +1,97 @@ +"""Publish Boost library usage dashboard artifacts to a GitHub repository.""" + +from __future__ import annotations + +import logging +import shutil +from datetime import datetime, timezone +from pathlib import Path + +from django.conf import settings +from django.core.management.base import CommandError + +from github_ops.git_ops import clone_repo, pull, push + +logger = logging.getLogger(__name__) + + +def publish_dashboard( + output_dir: Path, + owner: str, + repo: str, + branch: str, +) -> None: + """ + Publish using a persistent clone at 
raw/boost_library_usage_dashboard//. + Clone if missing, pull, sync ``develop/`` from output_dir, commit, push. + + Uses ``settings.GITHUB_TOKEN_WRITE`` for clone/pull/push and + ``settings.GIT_AUTHOR_NAME`` / ``settings.GIT_AUTHOR_EMAIL`` for the commit + identity (via env vars on ``git commit`` only). + """ + clone_dir = ( + Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo + ) + clone_dir = clone_dir.resolve() + output_dir = output_dir.resolve() + if ( + clone_dir == output_dir + or clone_dir in output_dir.parents + or output_dir in clone_dir.parents + ): + raise CommandError( + "Workspace output directory must not overlap with the publish clone path: " + f"{clone_dir}" + ) + + clone_dir.parent.mkdir(parents=True, exist_ok=True) + token = (getattr(settings, "GITHUB_TOKEN_WRITE", None) or "").strip() or None + git_user_name = (getattr(settings, "GIT_AUTHOR_NAME", None) or "unknown").strip() + git_user_email = ( + getattr(settings, "GIT_AUTHOR_EMAIL", None) or "unknown@noreply.github.com" + ).strip() + + repo_slug = f"{owner}/{repo}" + logger.info("Publishing dashboard artifacts to %s (%s)...", repo_slug, branch) + + if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): + if clone_dir.exists(): + shutil.rmtree(clone_dir) + logger.info("Cloning %s to %s", repo_slug, clone_dir) + clone_repo(repo_slug, clone_dir, token=token) + + logger.info("Pulling latest for %s", clone_dir) + pull(clone_dir, branch=branch, token=token) + + for child in clone_dir.iterdir(): + if child.name == ".git": + continue + if child.is_dir() and child.name == "develop": + shutil.rmtree(child) + + publish_subdir = clone_dir / "develop" + publish_subdir.mkdir(parents=True, exist_ok=True) + + for child in output_dir.iterdir(): + dest = publish_subdir / child.name + if child.is_dir(): + shutil.copytree(child, dest) + else: + if child.suffix != ".html": + continue + shutil.copy2(child, dest) + + commit_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S 
UTC") + commit_message = ( + f"Update Boost library usage dashboard artifacts ({commit_time})" + ) + push( + clone_dir, + remote="origin", + branch=branch, + commit_message=commit_message, + token=token, + git_user_name=git_user_name, + git_user_email=git_user_email, + ) + logger.info("Dashboard artifacts published successfully to %s.", repo_slug) diff --git a/boost_library_usage_dashboard/tests/test_analyzer.py b/boost_library_usage_dashboard/tests/test_analyzer.py index f5c8461..2ab0f39 100644 --- a/boost_library_usage_dashboard/tests/test_analyzer.py +++ b/boost_library_usage_dashboard/tests/test_analyzer.py @@ -9,7 +9,6 @@ def _make_analyzer() -> BoostUsageDashboardAnalyzer: analyzer = BoostUsageDashboardAnalyzer.__new__(BoostUsageDashboardAnalyzer) - analyzer.base_dir = Path(tempfile.gettempdir()) / "boost-dashboard-test-base" analyzer.output_dir = Path(tempfile.gettempdir()) / "boost-dashboard-test-output" analyzer.version_name_list = ["1.50.0", "1.51.0", "1.52.0", "1.53.0", "1.54.0"] analyzer.repo_info = [] diff --git a/boost_library_usage_dashboard/tests/test_command.py b/boost_library_usage_dashboard/tests/test_command.py index a13c8b3..67a22ec 100644 --- a/boost_library_usage_dashboard/tests/test_command.py +++ b/boost_library_usage_dashboard/tests/test_command.py @@ -1,13 +1,11 @@ """Tests for run_boost_library_usage_dashboard command.""" -from io import StringIO from pathlib import Path from unittest.mock import MagicMock, patch import pytest from django.conf import settings from django.core.management import call_command, get_commands -from django.core.management.base import CommandError @pytest.mark.django_db @@ -23,24 +21,20 @@ def test_dashboard_command_runs_generation_only(dashboard_cmd_name, tmp_path): fake_analyzer.report_file = tmp_path / "Boost_Usage_Report_total.md" fake_analyzer.stars_min_threshold = 10 - out = StringIO() - err = StringIO() - with patch( + 
"boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ) as analyzer_cls, patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.write_summary_report" ) as write_report, patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" - ) as render_html: - call_command( - dashboard_cmd_name, - "--output-dir", - str(tmp_path), - stdout=out, - stderr=err, - ) + ) as render_html, patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock: + call_command(dashboard_cmd_name, "--skip-publish") analyzer_cls.assert_called_once() fake_analyzer.run.assert_called_once() @@ -54,13 +48,14 @@ def test_dashboard_command_runs_generation_only(dashboard_cmd_name, tmp_path): base_dir=settings.BASE_DIR, output_dir=expected_output_dir, ) + publish_mock.assert_not_called() @pytest.mark.django_db -def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( +def test_dashboard_command_publish_with_owner_repo_calls_publish_dashboard( dashboard_cmd_name, tmp_path ): - """When --publish and settings have owner/repo, _publish_via_raw_clone is called.""" + """When owner/repo are set (settings or CLI), publish_dashboard is called.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "Boost_Usage_Report_total.md" @@ -68,6 +63,9 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( (tmp_path / "index.html").write_text("") with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( 
"boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ), patch( @@ -75,8 +73,8 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "myorg", @@ -91,15 +89,12 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( ): call_command( dashboard_cmd_name, - "--publish", - "--target-branch", + "--branch", "gh-pages", - "--output-dir", - str(tmp_path), ) - publish_raw_mock.assert_called_once() - call_kw = publish_raw_mock.call_args[1] + publish_mock.assert_called_once() + call_kw = publish_mock.call_args[1] assert call_kw["owner"] == "myorg" assert call_kw["repo"] == "my-repo" assert call_kw["branch"] == "gh-pages" @@ -110,13 +105,16 @@ def test_dashboard_command_publish_with_owner_repo_calls_publish_via_raw_clone( def test_dashboard_command_publish_uses_branch_from_settings_when_set( dashboard_cmd_name, tmp_path ): - """When BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH is set, it is passed to _publish_via_raw_clone.""" + """When BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH is set, it is used if --branch omitted.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "report.md" fake_analyzer.stars_min_threshold = 10 with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( 
"boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, ), patch( @@ -124,8 +122,8 @@ def test_dashboard_command_publish_uses_branch_from_settings_when_set( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "org", @@ -138,29 +136,25 @@ def test_dashboard_command_publish_uses_branch_from_settings_when_set( "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", "publish-branch", ): - call_command( - dashboard_cmd_name, - "--publish", - "--target-branch", - "main", - "--output-dir", - str(tmp_path), - ) + call_command(dashboard_cmd_name) - assert publish_raw_mock.call_args[1]["branch"] == "publish-branch" + assert publish_mock.call_args[1]["branch"] == "publish-branch" @pytest.mark.django_db -def test_dashboard_command_publish_no_owner_repo_raises_command_error( +def test_dashboard_command_publish_no_owner_repo_skips_publish( dashboard_cmd_name, tmp_path ): - """When --publish but owner or repo missing in settings, CommandError is raised.""" + """When owner and repo are missing, publish is skipped (no CommandError).""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {} fake_analyzer.report_file = tmp_path / "report.md" fake_analyzer.stars_min_threshold = 10 with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", return_value=fake_analyzer, 
), patch( @@ -168,8 +162,8 @@ def test_dashboard_command_publish_no_owner_repo_raises_command_error( ), patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" ), patch( - "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.Command._publish_via_raw_clone" - ) as publish_raw_mock, patch.object( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( settings, "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", "", @@ -178,12 +172,6 @@ def test_dashboard_command_publish_no_owner_repo_raises_command_error( "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", "", ): - with pytest.raises(CommandError) as exc_info: - call_command( - dashboard_cmd_name, - "--publish", - "--output-dir", - str(tmp_path), - ) - assert "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH" in str(exc_info.value) - publish_raw_mock.assert_not_called() + call_command(dashboard_cmd_name) + + publish_mock.assert_not_called() diff --git a/config/settings.py b/config/settings.py index 8f5cfde..91e57a1 100644 --- a/config/settings.py +++ b/config/settings.py @@ -289,19 +289,28 @@ env("BOOST_LIBRARY_TRACKER_REPO_BRANCH", default="master") or "master" ).strip() -# Settings for publishing boost_library_usage_dashboard +# ============================================================================= +# Boost Library Usage Dashboard +# run_boost_library_usage_dashboard writes artifacts under the workspace, then +# optionally publishes to the GitHub repo below (unless --skip-publish). Clone, +# pull, and push use GITHUB_TOKEN_WRITE. If PUBLISH_OWNER / PUBLISH_REPO are +# unset, publish is skipped (CLI --owner / --repo can override). GIT_AUTHOR_* +# set commit author for that push only (via git env vars, not git config). 
+# ============================================================================= BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", default="") or "" ).strip() BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", default="") or "" ).strip() -BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN = ( - env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_TOKEN", default="") or "" -).strip() or GITHUB_TOKEN_WRITE BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH = ( env("BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_BRANCH", default="") or "" ).strip() +GIT_AUTHOR_NAME = (env("GIT_AUTHOR_NAME", default="unknown") or "unknown").strip() +GIT_AUTHOR_EMAIL = ( + env("GIT_AUTHOR_EMAIL", default="unknown@noreply.github.com") + or "unknown@noreply.github.com" +).strip() # Slack (bot + app token for operations.slack_ops and cppa_slack_transcript_tracker) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 38a16fd..236c2be 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -179,12 +179,17 @@ def push( commit_message: Optional[str] = None, add_paths: Optional[list[str | Path]] = None, token: Optional[str] = None, + git_user_name: Optional[str] = None, + git_user_email: Optional[str] = None, ) -> None: """ Push to remote. Uses push token by default. Always runs git add, git commit, then push. Uses commit_message if provided, otherwise "Auto commit in ". add_paths: paths to add (relative to repo_dir); if None, adds all (git add .). + + git_user_name / git_user_email: if set, passed only to the ``git commit`` subprocess + via GIT_AUTHOR_* / GIT_COMMITTER_* env vars (does not modify repo ``git config``). 
""" repo_dir = Path(repo_dir) if token is None: @@ -203,10 +208,20 @@ def push( capture_output=True, text=True, ) + commit_env = dict(os.environ) + if git_user_name: + commit_env["GIT_AUTHOR_NAME"] = git_user_name + commit_env["GIT_COMMITTER_NAME"] = git_user_name + if git_user_email: + commit_env["GIT_AUTHOR_EMAIL"] = git_user_email + commit_env["GIT_COMMITTER_EMAIL"] = git_user_email commit_result = subprocess.run( ["git", "-C", str(repo_dir), "commit", "-m", message], capture_output=True, text=True, + encoding="utf-8", + errors="replace", + env=commit_env, ) if commit_result.returncode != 0: out = (commit_result.stderr or "") + (commit_result.stdout or "") From 92944da8b6aee1f235321948e33dbe86f57ca545 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 13:57:35 -0400 Subject: [PATCH 52/76] Fix: lint/format error - #132 --- boost_library_usage_dashboard/publisher.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/boost_library_usage_dashboard/publisher.py b/boost_library_usage_dashboard/publisher.py index 533fb9b..884ab3c 100644 --- a/boost_library_usage_dashboard/publisher.py +++ b/boost_library_usage_dashboard/publisher.py @@ -29,9 +29,7 @@ def publish_dashboard( ``settings.GIT_AUTHOR_NAME`` / ``settings.GIT_AUTHOR_EMAIL`` for the commit identity (via env vars on ``git commit`` only). 
""" - clone_dir = ( - Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo - ) + clone_dir = Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo clone_dir = clone_dir.resolve() output_dir = output_dir.resolve() if ( @@ -82,9 +80,7 @@ def publish_dashboard( shutil.copy2(child, dest) commit_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") - commit_message = ( - f"Update Boost library usage dashboard artifacts ({commit_time})" - ) + commit_message = f"Update Boost library usage dashboard artifacts ({commit_time})" push( clone_dir, remote="origin", From 3968a2fa2c8d8de341b31194bd1def1358723b1f Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 14:53:07 -0400 Subject: [PATCH 53/76] add 403 error fix logic in upload folder to github - #134 --- github_ops/git_ops.py | 57 +++++++++++++++++++++++------ github_ops/tests/test_git_ops.py | 61 ++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 11 deletions(-) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 38a16fd..5a31cef 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -9,6 +9,7 @@ import base64 import logging import os +import random import re import subprocess import threading @@ -26,9 +27,14 @@ logger = logging.getLogger(__name__) # Fewer workers to avoid GitHub secondary rate limit (403 when too many concurrent requests) -_UPLOAD_FOLDER_MAX_WORKERS = 8 +_UPLOAD_FOLDER_MAX_WORKERS = 4 _UPLOAD_FOLDER_BLOB_RETRIES = 5 -_UPLOAD_FOLDER_403_WAIT_SEC = 60 +# Cap concurrent blob POSTs across all executor threads (primary + secondary limit relief) +_UPLOAD_FOLDER_BLOB_MAX_CONCURRENT = 3 +# Max seconds to sleep in one wait after 403 (avoid unbounded sleeps from bad headers) +_UPLOAD_FOLDER_403_MAX_SLEEP_SEC = 900 + +_blob_post_semaphore = threading.BoundedSemaphore(_UPLOAD_FOLDER_BLOB_MAX_CONCURRENT) _thread_local = threading.local() @@ -50,6 +56,40 @@ def _get_worker_session(token: str) -> requests.Session: 
return _thread_local.session +def _wait_seconds_for_github_403(r: requests.Response, attempt: int) -> float: + """Sleep duration after a 403 from GitHub (primary limit, Retry-After, or fallback).""" + max_sleep = float(_UPLOAD_FOLDER_403_MAX_SLEEP_SEC) + h = r.headers + + remaining = h.get("X-RateLimit-Remaining") + reset_raw = h.get("X-RateLimit-Reset") + try: + if remaining is not None and int(remaining) == 0 and reset_raw is not None: + reset_ts = int(reset_raw) + wait = max(0.0, float(reset_ts) - time.time()) + if wait < 1.0: + wait = 1.0 + wait += random.uniform(0, 2) + return min(wait, max_sleep) + except (TypeError, ValueError): + pass + + ra = h.get("Retry-After") + if ra is not None: + try: + wait = float(ra) + if wait < 1.0: + wait = 1.0 + wait += random.uniform(0, 1) + return min(wait, max_sleep) + except (TypeError, ValueError): + pass + + base = 5.0 * (2.0**attempt) + wait = min(base + random.uniform(0, 2), max_sleep) + return wait + + def _create_blob_with_retry( base: str, token: str, repo_path: str, local_path: Path ) -> tuple[str, str]: @@ -64,18 +104,13 @@ def _create_blob_with_retry( last_err = None for attempt in range(_UPLOAD_FOLDER_BLOB_RETRIES): try: - r = session.post(url, json=blob_data, timeout=30) + with _blob_post_semaphore: + r = session.post(url, json=blob_data, timeout=30) if r.status_code == 403: - # GitHub secondary rate limit; wait and retry (cap at our constant) - wait_sec = _UPLOAD_FOLDER_403_WAIT_SEC - try: - from_header = int(r.headers.get("Retry-After", wait_sec)) - wait_sec = min(from_header, _UPLOAD_FOLDER_403_WAIT_SEC) - except (TypeError, ValueError): - pass + wait_sec = _wait_seconds_for_github_403(r, attempt) if attempt < _UPLOAD_FOLDER_BLOB_RETRIES - 1: logger.warning( - "Blob upload 403 (rate limit), waiting %ss before retry (%s)", + "Blob upload 403 (rate limit), waiting %.1fs before retry (%s)", wait_sec, repo_path, ) diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 
94201bc..84c7664 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -496,6 +496,67 @@ def test_create_blob_with_retry_returns_sha_on_success(): mock_session.post.assert_called_once() +def test_create_blob_with_retry_403_waits_using_rate_limit_reset(): + """_create_blob_with_retry sleeps until X-RateLimit-Reset when Remaining is 0.""" + mock_403 = MagicMock() + mock_403.status_code = 403 + mock_403.headers = { + "X-RateLimit-Remaining": "0", + "X-RateLimit-Reset": "1007", + } + mock_ok = MagicMock() + mock_ok.status_code = 201 + mock_ok.json.return_value = {"sha": "sha_after_reset_wait"} + mock_ok.raise_for_status = MagicMock() + + mock_session = MagicMock() + mock_session.post.side_effect = [mock_403, mock_ok] + mock_path = MagicMock() + mock_path.read_bytes.return_value = b"x" + + with patch("github_ops.git_ops._get_worker_session", return_value=mock_session): + with patch("github_ops.git_ops.time.time", return_value=1000.0): + with patch("github_ops.git_ops.random.uniform", return_value=0.0): + with patch("github_ops.git_ops.time.sleep") as sleep_mock: + out = _create_blob_with_retry( + "https://api.github.com/repos/o/r", + "token", + "f.txt", + mock_path, + ) + assert out == ("f.txt", "sha_after_reset_wait") + sleep_mock.assert_called_once_with(7.0) + assert mock_session.post.call_count == 2 + + +def test_create_blob_with_retry_403_exponential_when_no_headers(): + """_create_blob_with_retry uses exponential backoff on 403 without rate-limit headers.""" + mock_403 = MagicMock() + mock_403.status_code = 403 + mock_403.headers = {} + mock_ok = MagicMock() + mock_ok.status_code = 201 + mock_ok.json.return_value = {"sha": "sha_ok"} + mock_ok.raise_for_status = MagicMock() + + mock_session = MagicMock() + mock_session.post.side_effect = [mock_403, mock_ok] + mock_path = MagicMock() + mock_path.read_bytes.return_value = b"x" + + with patch("github_ops.git_ops._get_worker_session", return_value=mock_session): + with 
patch("github_ops.git_ops.random.uniform", return_value=0.0): + with patch("github_ops.git_ops.time.sleep") as sleep_mock: + out = _create_blob_with_retry( + "https://api.github.com/repos/o/r", + "token", + "f.txt", + mock_path, + ) + assert out == ("f.txt", "sha_ok") + sleep_mock.assert_called_once_with(5.0) + + # --- get_commit_file_changes --- From da881acf7b0ccda64e9f131d383c77ec20792c67 Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 15:13:30 -0400 Subject: [PATCH 54/76] fix(dashboard): bootstrap publish clone and block publish without HTML - #132 --- .../run_boost_library_usage_dashboard.py | 7 ++- boost_library_usage_dashboard/publisher.py | 8 ++- .../tests/test_command.py | 39 ++++++++++++ github_ops/git_ops.py | 62 +++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py index 02899b5..03ab46f 100644 --- a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py +++ b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py @@ -1,7 +1,7 @@ import logging from django.conf import settings -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError from boost_library_usage_dashboard.analyzer import BoostUsageDashboardAnalyzer from boost_library_usage_dashboard.publisher import publish_dashboard @@ -109,6 +109,11 @@ def handle(self, *args, **options): "--owner and --repo." ) else: + if not any(output_dir.rglob("*.html")): + raise CommandError( + "Refusing to publish: no HTML artifacts were found in " + f"{output_dir}. Run without --skip-render first." 
+ ) publish_dashboard( output_dir=output_dir, owner=owner, diff --git a/boost_library_usage_dashboard/publisher.py b/boost_library_usage_dashboard/publisher.py index 884ab3c..4d803f2 100644 --- a/boost_library_usage_dashboard/publisher.py +++ b/boost_library_usage_dashboard/publisher.py @@ -10,7 +10,7 @@ from django.conf import settings from django.core.management.base import CommandError -from github_ops.git_ops import clone_repo, pull, push +from github_ops.git_ops import clone_repo, prepare_repo_for_pull, pull, push logger = logging.getLogger(__name__) @@ -23,7 +23,8 @@ def publish_dashboard( ) -> None: """ Publish using a persistent clone at raw/boost_library_usage_dashboard//. - Clone if missing, pull, sync ``develop/`` from output_dir, commit, push. + Clone if missing, then fetch/clean/reset the clone, pull, sync ``develop/`` from + output_dir, commit, push. Uses ``settings.GITHUB_TOKEN_WRITE`` for clone/pull/push and ``settings.GIT_AUTHOR_NAME`` / ``settings.GIT_AUTHOR_EMAIL`` for the commit @@ -58,6 +59,9 @@ def publish_dashboard( logger.info("Cloning %s to %s", repo_slug, clone_dir) clone_repo(repo_slug, clone_dir, token=token) + logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) + prepare_repo_for_pull(clone_dir, remote="origin", token=token) + logger.info("Pulling latest for %s", clone_dir) pull(clone_dir, branch=branch, token=token) diff --git a/boost_library_usage_dashboard/tests/test_command.py b/boost_library_usage_dashboard/tests/test_command.py index 67a22ec..0693c93 100644 --- a/boost_library_usage_dashboard/tests/test_command.py +++ b/boost_library_usage_dashboard/tests/test_command.py @@ -6,6 +6,7 @@ import pytest from django.conf import settings from django.core.management import call_command, get_commands +from django.core.management.base import CommandError @pytest.mark.django_db @@ -110,6 +111,7 @@ def test_dashboard_command_publish_uses_branch_from_settings_when_set( fake_analyzer.run.return_value = {} 
fake_analyzer.report_file = tmp_path / "report.md" fake_analyzer.stars_min_threshold = 10 + (tmp_path / "index.html").write_text("") with patch( "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", @@ -175,3 +177,40 @@ def test_dashboard_command_publish_no_owner_repo_skips_publish( call_command(dashboard_cmd_name) publish_mock.assert_not_called() + + +@pytest.mark.django_db +def test_dashboard_command_publish_refuses_without_html_artifacts( + dashboard_cmd_name, tmp_path +): + """Publish with owner/repo but no *.html under output_dir raises CommandError.""" + with patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.get_workspace_path", + return_value=tmp_path, + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.BoostUsageDashboardAnalyzer", + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.write_summary_report" + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.render_dashboard_html" + ), patch( + "boost_library_usage_dashboard.management.commands.run_boost_library_usage_dashboard.publish_dashboard" + ) as publish_mock, patch.object( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_OWNER", + "org", + ), patch.object( + settings, + "BOOST_LIBRARY_USAGE_DASHBOARD_PUBLISH_REPO", + "repo", + ): + tmp_path.mkdir(parents=True, exist_ok=True) + (tmp_path / "dashboard_data.json").write_text("{}") + with pytest.raises(CommandError) as exc_info: + call_command( + dashboard_cmd_name, + "--skip-collect", + "--skip-render", + ) + assert "no HTML artifacts" in str(exc_info.value) + publish_mock.assert_not_called() diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 236c2be..0846d14 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -304,6 +304,68 @@ def pull( subprocess.run(cmd, check=True, capture_output=True, 
text=True) +def prepare_repo_for_pull( + repo_dir: str | Path, + *, + remote: str = "origin", + token: Optional[str] = None, +) -> None: + """ + Fetch remote branch refs (prune), remove untracked files, and reset the working tree. + + Use before checkout/pull on a reused clone that may have local changes or lack + remote-tracking refs for branches that exist only on the remote. + """ + repo_dir = Path(repo_dir) + if token is None: + token = get_github_token(use="push") + result = subprocess.run( + ["git", "-C", str(repo_dir), "remote", "get-url", remote], + capture_output=True, + text=True, + check=True, + ) + remote_url = result.stdout.strip() + auth_url = _url_with_token(remote_url, token or "") + + logger.info("Fetching %s refs (prune) in %s", remote, repo_dir) + subprocess.run( + [ + "git", + "-C", + str(repo_dir), + "fetch", + auth_url, + f"+refs/heads/*:refs/remotes/{remote}/*", + "--prune", + ], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=GIT_CMD_TIMEOUT_SECONDS, + ) + logger.info("Running git clean -fd in %s", repo_dir) + subprocess.run( + ["git", "-C", str(repo_dir), "clean", "-fd"], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + logger.info("Running git reset --hard in %s", repo_dir) + subprocess.run( + ["git", "-C", str(repo_dir), "reset", "--hard"], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + + def fetch_file_content( owner: str, repo: str, From 9a209390753c83f00874d698cafa1836d788935c Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 17:02:44 -0400 Subject: [PATCH 55/76] fix: harden dashboard publish paths, GitHub HTTPS auth, and git commit env - #132 --- .../run_boost_github_activity_tracker.py | 14 +---- .../management/__init__.py | 1 + .../management/commands/__init__.py | 1 + .../run_boost_library_usage_dashboard.py | 6 +++ boost_library_usage_dashboard/publisher.py | 34 
+++++++++++- .../tests/fixtures.py | 1 + .../tests/test_command.py | 2 + .../tests/test_publisher.py | 54 +++++++++++++++++++ github_ops/git_ops.py | 22 ++++++-- github_ops/tests/test_git_ops.py | 12 ++--- 10 files changed, 123 insertions(+), 24 deletions(-) create mode 100644 boost_library_usage_dashboard/tests/test_publisher.py diff --git a/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py b/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py index c12e180..fcc9510 100644 --- a/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py +++ b/boost_library_tracker/management/commands/run_boost_github_activity_tracker.py @@ -121,8 +121,6 @@ def _push_markdown_to_github( all_new_files: dict[str, str], ) -> None: """Upload generated Markdown to BOOST_LIBRARY_TRACKER_REPO_*; unlink locals on success.""" - if not all_new_files: - return cfg = _markdown_export_repo_config() if not cfg: logger.error( @@ -504,17 +502,7 @@ def handle(self, *args, **options): if not skip_remote_push: logger.info("push Markdown to configured GitHub repo") - if not all_new_files: - if skip_markdown_export and not skip_github_sync: - logger.warning( - "nothing new to push (--skip-markdown-export); skipping remote push" - ) - elif skip_github_sync: - logger.warning( - "nothing to push from this run (sync was skipped)" - ) - else: - _push_markdown_to_github(md_output_dir, all_new_files) + _push_markdown_to_github(md_output_dir, all_new_files) else: logger.info("skipping remote push (--skip-remote-push)") diff --git a/boost_library_usage_dashboard/management/__init__.py b/boost_library_usage_dashboard/management/__init__.py index e69de29..a70886e 100644 --- a/boost_library_usage_dashboard/management/__init__.py +++ b/boost_library_usage_dashboard/management/__init__.py @@ -0,0 +1 @@ +"""Django management package for boost_library_usage_dashboard.""" diff --git 
a/boost_library_usage_dashboard/management/commands/__init__.py b/boost_library_usage_dashboard/management/commands/__init__.py index e69de29..edda154 100644 --- a/boost_library_usage_dashboard/management/commands/__init__.py +++ b/boost_library_usage_dashboard/management/commands/__init__.py @@ -0,0 +1 @@ +"""Management commands for the boost_library_usage_dashboard app.""" diff --git a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py index 03ab46f..0442e99 100644 --- a/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py +++ b/boost_library_usage_dashboard/management/commands/run_boost_library_usage_dashboard.py @@ -1,3 +1,5 @@ +"""Build the Boost library usage dashboard from DB data and optionally publish to GitHub.""" + import logging from django.conf import settings @@ -13,12 +15,15 @@ class Command(BaseCommand): + """Django management command: collect metrics, render HTML, optionally push to GitHub.""" + help = ( "Generate Boost library usage report/dashboard from PostgreSQL data, " "then publish generated files to a target GitHub repository unless skipped." 
) def add_arguments(self, parser): + """Register skip flags and publish target overrides.""" parser.add_argument( "--skip-collect", action="store_true", @@ -54,6 +59,7 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): + """Run collect/render steps, then publish when configured and artifacts exist.""" output_dir = get_workspace_path("boost_library_usage_dashboard").resolve() output_dir.mkdir(parents=True, exist_ok=True) diff --git a/boost_library_usage_dashboard/publisher.py b/boost_library_usage_dashboard/publisher.py index 4d803f2..b09a18d 100644 --- a/boost_library_usage_dashboard/publisher.py +++ b/boost_library_usage_dashboard/publisher.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import re import shutil from datetime import datetime, timezone from pathlib import Path @@ -14,6 +15,25 @@ logger = logging.getLogger(__name__) +# GitHub owner/login and repository name: single path segment, no traversal. +_GITHUB_OWNER_REPO_SLUG = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?$") + + +def _validate_github_slug(label: str, value: str) -> str: + """Return stripped owner or repo name, or raise CommandError if unsafe or invalid.""" + v = (value or "").strip() + if not v: + raise CommandError(f"Invalid GitHub {label}: empty") + if v in (".", ".."): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if "/" in v or "\\" in v: + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if Path(v).is_absolute(): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if not _GITHUB_OWNER_REPO_SLUG.fullmatch(v): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + return v + def publish_dashboard( output_dir: Path, @@ -30,8 +50,18 @@ def publish_dashboard( ``settings.GIT_AUTHOR_NAME`` / ``settings.GIT_AUTHOR_EMAIL`` for the commit identity (via env vars on ``git commit`` only). 
""" - clone_dir = Path(settings.RAW_DIR) / "boost_library_usage_dashboard" / owner / repo - clone_dir = clone_dir.resolve() + owner = _validate_github_slug("owner", owner) + repo = _validate_github_slug("repo", repo) + + publish_root = (Path(settings.RAW_DIR) / "boost_library_usage_dashboard").resolve() + clone_dir = (publish_root / owner / repo).resolve() + try: + clone_dir.relative_to(publish_root) + except ValueError: + raise CommandError( + f"Publish clone path escapes dashboard publish root: {clone_dir}" + ) from None + output_dir = output_dir.resolve() if ( clone_dir == output_dir diff --git a/boost_library_usage_dashboard/tests/fixtures.py b/boost_library_usage_dashboard/tests/fixtures.py index 2994541..66c0ac5 100644 --- a/boost_library_usage_dashboard/tests/fixtures.py +++ b/boost_library_usage_dashboard/tests/fixtures.py @@ -5,4 +5,5 @@ @pytest.fixture def dashboard_cmd_name(): + """Name of the ``run_boost_library_usage_dashboard`` management command.""" return "run_boost_library_usage_dashboard" diff --git a/boost_library_usage_dashboard/tests/test_command.py b/boost_library_usage_dashboard/tests/test_command.py index 0693c93..7f91e11 100644 --- a/boost_library_usage_dashboard/tests/test_command.py +++ b/boost_library_usage_dashboard/tests/test_command.py @@ -11,12 +11,14 @@ @pytest.mark.django_db def test_dashboard_command_exists(dashboard_cmd_name): + """The dashboard management command is registered with Django.""" commands = get_commands() assert dashboard_cmd_name in commands @pytest.mark.django_db def test_dashboard_command_runs_generation_only(dashboard_cmd_name, tmp_path): + """Default collect+render runs; publish is skipped when ``--skip-publish`` is passed.""" fake_analyzer = MagicMock() fake_analyzer.run.return_value = {"total_repositories": 0} fake_analyzer.report_file = tmp_path / "Boost_Usage_Report_total.md" diff --git a/boost_library_usage_dashboard/tests/test_publisher.py b/boost_library_usage_dashboard/tests/test_publisher.py new file 
mode 100644 index 0000000..fa56512 --- /dev/null +++ b/boost_library_usage_dashboard/tests/test_publisher.py @@ -0,0 +1,54 @@ +"""Tests for boost_library_usage_dashboard.publisher validation.""" + +from unittest.mock import patch + +import pytest +from django.conf import settings +from django.core.management.base import CommandError + +from boost_library_usage_dashboard.publisher import publish_dashboard + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_owner_with_path_separator(tmp_path): + """Owner must be a single slug; path separators are rejected.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_dashboard( + tmp_path / "out", + owner="foo/bar", + repo="repo", + branch="main", + ) + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_dotdot_repo(tmp_path): + """Repo must not be path-like.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub repo"): + publish_dashboard( + tmp_path / "out", + owner="org", + repo="..", + branch="main", + ) + + +@pytest.mark.django_db +def test_publish_dashboard_rejects_invalid_slug_chars(tmp_path): + """Spaces and other disallowed characters are rejected.""" + raw = tmp_path / "raw" + raw.mkdir() + with patch.object(settings, "RAW_DIR", str(raw)): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_dashboard( + tmp_path / "out", + owner="bad name", + repo="repo", + branch="main", + ) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 0846d14..b91749a 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -105,12 +105,17 @@ def _create_blob_with_retry( def _url_with_token(url: str, token: str) -> str: - """Inject token into GitHub HTTPS URL for auth.""" + """Inject credentials into a GitHub HTTPS URL for Git over HTTPS. 
+ + Uses ``x-access-token:`` as the userinfo segment. Required for + fine-grained PATs (``github_pat_...``); classic PATs work with this form too. + """ if not token: return url + auth = f"x-access-token:{token}" return re.sub( r"^(https://)(github\.com/)", - r"\1" + token + r"@\2", + r"\1" + auth + r"@\2", url, count=1, ) @@ -162,11 +167,13 @@ def clone_repo( ) raise except subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] logger.warning( - "git clone failed (%s -> %s), returncode=%s", + "git clone failed (%s -> %s), returncode=%s, stderr/stdout_tail=%r", url_or_slug, dest_dir, e.returncode, + err_tail, ) raise @@ -190,6 +197,8 @@ def push( git_user_name / git_user_email: if set, passed only to the ``git commit`` subprocess via GIT_AUTHOR_* / GIT_COMMITTER_* env vars (does not modify repo ``git config``). + Any existing GIT_AUTHOR_* / GIT_COMMITTER_* entries are removed from the commit + environment first so ambient or Django-set values are not inherited when unset. 
""" repo_dir = Path(repo_dir) if token is None: @@ -209,6 +218,13 @@ def push( text=True, ) commit_env = dict(os.environ) + for _key in ( + "GIT_AUTHOR_NAME", + "GIT_AUTHOR_EMAIL", + "GIT_COMMITTER_NAME", + "GIT_COMMITTER_EMAIL", + ): + commit_env.pop(_key, None) if git_user_name: commit_env["GIT_AUTHOR_NAME"] = git_user_name commit_env["GIT_COMMITTER_NAME"] = git_user_name diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 94201bc..9d51906 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -26,10 +26,10 @@ def test_url_with_token_empty_token_returns_unchanged(): def test_url_with_token_injects_token_before_github_com(): - """_url_with_token injects token into HTTPS GitHub URL.""" + """_url_with_token uses x-access-token form for GitHub HTTPS Git auth.""" url = "https://github.com/owner/repo.git" out = _url_with_token(url, "secret") - assert out == "https://secret@github.com/owner/repo.git" + assert out == "https://x-access-token:secret@github.com/owner/repo.git" def test_url_with_token_none_like_token_returns_unchanged(): @@ -42,7 +42,7 @@ def test_url_with_token_only_replaces_first_occurrence(): """_url_with_token uses count=1 so only first https://github.com/ is modified.""" url = "https://github.com/boostorg/boost.git" out = _url_with_token(url, "tok") - assert out == "https://tok@github.com/boostorg/boost.git" + assert out == "https://x-access-token:tok@github.com/boostorg/boost.git" # --- clone_repo --- @@ -69,9 +69,9 @@ def test_clone_repo_slug_converted_to_https_url(tmp_path): with patch("github_ops.git_ops.subprocess.run", MagicMock()) as run_mock: clone_repo("owner/repo", tmp_path, token="t") call_args = run_mock.call_args[0][0] - assert ( - "https://github.com/owner/repo.git" in call_args[2] - or "t@github.com" in call_args[2] + clone_url = call_args[2] + assert "https://github.com/owner/repo.git" in clone_url or ( + "x-access-token:t@" in clone_url and "github.com/owner/repo.git" 
in clone_url ) From f68806028a0b346039bfe7917aff550a92a803cb Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Mon, 30 Mar 2026 19:30:52 -0400 Subject: [PATCH 56/76] Update: use 60s base for blob 403 fallback - #134 --- github_ops/git_ops.py | 2 +- github_ops/tests/test_git_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 5a31cef..6490079 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -85,7 +85,7 @@ def _wait_seconds_for_github_403(r: requests.Response, attempt: int) -> float: except (TypeError, ValueError): pass - base = 5.0 * (2.0**attempt) + base = 60.0 * (2.0**attempt) wait = min(base + random.uniform(0, 2), max_sleep) return wait diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 84c7664..475b2b9 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -554,7 +554,7 @@ def test_create_blob_with_retry_403_exponential_when_no_headers(): mock_path, ) assert out == ("f.txt", "sha_ok") - sleep_mock.assert_called_once_with(5.0) + sleep_mock.assert_called_once_with(60.0) # --- get_commit_file_changes --- From b870f17f5f561441e3ee9099b211b2309bc1631a Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Tue, 31 Mar 2026 13:40:04 -0400 Subject: [PATCH 57/76] Fix: Update unecessary logic - #134 --- github_ops/git_ops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 6490079..5674796 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -66,9 +66,7 @@ def _wait_seconds_for_github_403(r: requests.Response, attempt: int) -> float: try: if remaining is not None and int(remaining) == 0 and reset_raw is not None: reset_ts = int(reset_raw) - wait = max(0.0, float(reset_ts) - time.time()) - if wait < 1.0: - wait = 1.0 + wait = max(1.0, float(reset_ts) - time.time()) wait += random.uniform(0, 2) return min(wait, max_sleep) except 
(TypeError, ValueError): From b8ccf7bace6920a9905366296f7419a9a4e877dc Mon Sep 17 00:00:00 2001 From: zho Date: Mon, 6 Apr 2026 23:00:46 +0800 Subject: [PATCH 58/76] #126-fixed file name and duplications --- ...t_preprocesser.py => test_preprocessor.py} | 0 config/settings.py | 91 +++++-------------- 2 files changed, 21 insertions(+), 70 deletions(-) rename boost_mailing_list_tracker/tests/{test_preprocesser.py => test_preprocessor.py} (100%) diff --git a/boost_mailing_list_tracker/tests/test_preprocesser.py b/boost_mailing_list_tracker/tests/test_preprocessor.py similarity index 100% rename from boost_mailing_list_tracker/tests/test_preprocesser.py rename to boost_mailing_list_tracker/tests/test_preprocessor.py diff --git a/config/settings.py b/config/settings.py index 6c9726d..5eb42c6 100644 --- a/config/settings.py +++ b/config/settings.py @@ -192,6 +192,13 @@ env("PINECONE_SPARSE_MODEL", default="pinecone-sparse-english-v0") or "pinecone-sparse-english-v0" ).strip() or "pinecone-sparse-english-v0" +# Slack → Pinecone namespace/app_type prefix (cppa_pinecone_sync / slack pipelines) +PINECONE_SLACK_NAMESPACE_PREFIX = ( + env("PINECONE_SLACK_NAMESPACE_PREFIX", default="slack") or "slack" +).strip() or "slack" +PINECONE_SLACK_APP_TYPE_PREFIX = ( + env("PINECONE_SLACK_APP_TYPE_PREFIX", default="slack") or "slack" +).strip() or "slack" # Pinecone sync: app_type and namespace per app (used when CLI does not pass --pinecone-app-type/--pinecone-namespace) # Boost Mailing List Tracker @@ -298,44 +305,30 @@ SLACK_TEAM_ID = (env("SLACK_TEAM_ID", default="") or "").strip() -def _slack_bot_token_from_env(): - """Build a dict of team_id -> bot token from SLACK_TEAM_IDS and SLACK_BOT_TOKEN_ env vars.""" - out = {} +def _slack_team_ids_from_env(): + """Comma-separated SLACK_TEAM_IDS → non-empty team id strings.""" ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() if not ids_raw: - return out - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue 
- key = f"SLACK_BOT_TOKEN_{tid}" - token = (env(key, default="") or "").strip() - if token: - out[tid] = token - return out - + return [] + return [tid.strip() for tid in ids_raw.split(",") if tid.strip()] -SLACK_BOT_TOKEN = _slack_bot_token_from_env() - -def _slack_app_token_from_env(): - """Build a dict of team_id -> app token from SLACK_TEAM_IDS and SLACK_APP_TOKEN_ env vars.""" +def _slack_per_team_tokens_from_env(env_key_prefix: str): + """ + Build team_id -> token from SLACK_TEAM_IDS and ``{prefix}_{team_id}`` env vars + (e.g. prefix SLACK_BOT_TOKEN → SLACK_BOT_TOKEN_T123). + """ out = {} - ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() - if not ids_raw: - return out - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue - key = f"SLACK_APP_TOKEN_{tid}" + for tid in _slack_team_ids_from_env(): + key = f"{env_key_prefix}_{tid}" token = (env(key, default="") or "").strip() if token: out[tid] = token return out -SLACK_APP_TOKEN = _slack_app_token_from_env() +SLACK_BOT_TOKEN = _slack_per_team_tokens_from_env("SLACK_BOT_TOKEN") +SLACK_APP_TOKEN = _slack_per_team_tokens_from_env("SLACK_APP_TOKEN") def _slack_team_scope_from_env(): @@ -346,14 +339,8 @@ def _slack_team_scope_from_env(): If SLACK_TEAM_SCOPE_ is missing or empty, that team gets [0, 1] (both). """ out = {} - ids_raw = (env("SLACK_TEAM_IDS", default="") or "").strip() - if not ids_raw: - return out valid_scopes = {0, 1} - for tid in ids_raw.split(","): - tid = tid.strip() - if not tid: - continue + for tid in _slack_team_ids_from_env(): key = f"SLACK_TEAM_SCOPE_{tid}" raw = (env(key, default="") or "").strip() if not raw: @@ -526,42 +513,6 @@ def _slack_team_scope_from_env(): ) CELERY_BEAT_SCHEDULE = {} -# ============================================================================= -# Pinecone (cppa_pinecone_sync) - vector index for RAG sync -# ============================================================================= -# Public API key (default). 
Used when instance=public or unset. -PINECONE_API_KEY = (env("PINECONE_API_KEY", default="") or "").strip() -# Private API key. Used when instance=private. -PINECONE_PRIVATE_API_KEY = (env("PINECONE_PRIVATE_API_KEY", default="") or "").strip() -# Index name (required for sync). Set in .env to enable Slack/mailing list → Pinecone. -PINECONE_INDEX_NAME = (env("PINECONE_INDEX_NAME", default="") or "").strip() -PINECONE_ENVIRONMENT = ( - env("PINECONE_ENVIRONMENT", default="us-east-1") or "us-east-1" -).strip() -PINECONE_CLOUD = (env("PINECONE_CLOUD", default="aws") or "aws").strip() -PINECONE_BATCH_SIZE = int(env("PINECONE_BATCH_SIZE", default="96") or "96") -PINECONE_UPDATE_MAX_WORKERS = int( - env("PINECONE_UPDATE_MAX_WORKERS", default="8") or "8" -) -PINECONE_CHUNK_SIZE = int(env("PINECONE_CHUNK_SIZE", default="1000") or "1000") -PINECONE_CHUNK_OVERLAP = int(env("PINECONE_CHUNK_OVERLAP", default="200") or "200") -PINECONE_MIN_TEXT_LENGTH = int(env("PINECONE_MIN_TEXT_LENGTH", default="50") or "50") -PINECONE_MIN_WORDS = int(env("PINECONE_MIN_WORDS", default="5") or "5") -PINECONE_SLACK_NAMESPACE_PREFIX = ( - env("PINECONE_SLACK_NAMESPACE_PREFIX", default="slack") or "slack" -).strip() -PINECONE_SLACK_APP_TYPE_PREFIX = ( - env("PINECONE_SLACK_APP_TYPE_PREFIX", default="slack") or "slack" -).strip() -PINECONE_DENSE_MODEL = ( - env("PINECONE_DENSE_MODEL", default="multilingual-e5-large") - or "multilingual-e5-large" -).strip() -PINECONE_SPARSE_MODEL = ( - env("PINECONE_SPARSE_MODEL", default="pinecone-sparse-english-v0") - or "pinecone-sparse-english-v0" -).strip() - # GitHub activity tracker: Redis for ETag cache (conditional GET). Use separate DB index. # To persist the cache across restarts, enable Redis persistence (RDB or AOF) in redis.conf: # RDB: leave default "save" rules (e.g. save 900 1) and set dir/dbfilename. 
From 763ac5439bc06b52f9bffcdc718bb3134eddd1ea Mon Sep 17 00:00:00 2001 From: zho Date: Mon, 6 Apr 2026 23:05:16 +0800 Subject: [PATCH 59/76] #126-fixed content typo of test file --- .../tests/test_preprocessor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/boost_mailing_list_tracker/tests/test_preprocessor.py b/boost_mailing_list_tracker/tests/test_preprocessor.py index fff8803..a12b6e1 100644 --- a/boost_mailing_list_tracker/tests/test_preprocessor.py +++ b/boost_mailing_list_tracker/tests/test_preprocessor.py @@ -11,7 +11,7 @@ @pytest.mark.django_db -def test_preprocesser_returns_empty_when_no_messages(): +def test_preprocessor_returns_empty_when_no_messages(): """No source rows -> empty docs and is_chunked=False.""" docs, is_chunked = preprocess_mailing_list_for_pinecone([], None) assert docs == [] @@ -19,7 +19,7 @@ def test_preprocesser_returns_empty_when_no_messages(): @pytest.mark.django_db -def test_preprocesser_first_sync_returns_all_messages( +def test_preprocessor_first_sync_returns_all_messages( mailing_list_profile, default_list_name, sample_sent_at, @@ -52,7 +52,7 @@ def test_preprocesser_first_sync_returns_all_messages( @pytest.mark.django_db -def test_preprocesser_incremental_by_created_at( +def test_preprocessor_incremental_by_created_at( mailing_list_profile, default_list_name, sample_sent_at, @@ -92,7 +92,7 @@ def test_preprocesser_incremental_by_created_at( @pytest.mark.django_db -def test_preprocesser_retries_failed_ids_even_if_old( +def test_preprocessor_retries_failed_ids_even_if_old( mailing_list_profile, default_list_name, sample_sent_at, @@ -124,7 +124,7 @@ def test_preprocesser_retries_failed_ids_even_if_old( @pytest.mark.django_db -def test_preprocesser_deduplicates_overlap_between_failed_and_incremental( +def test_preprocessor_deduplicates_overlap_between_failed_and_incremental( mailing_list_profile, default_list_name, sample_sent_at, @@ -153,7 +153,7 @@ def 
test_preprocesser_deduplicates_overlap_between_failed_and_incremental( @pytest.mark.django_db -def test_preprocesser_document_shape_and_metadata_fields( +def test_preprocessor_document_shape_and_metadata_fields( mailing_list_profile, default_list_name, sample_sent_at, @@ -198,7 +198,7 @@ def test_preprocesser_document_shape_and_metadata_fields( @pytest.mark.django_db -def test_preprocesser_handles_empty_body_with_metadata_fallback_content( +def test_preprocessor_handles_empty_body_with_metadata_fallback_content( mailing_list_profile, default_list_name, sample_sent_at, From edd54e40e72ab60b32690dcbea6e66815a4f6dba Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Tue, 31 Mar 2026 13:29:46 -0400 Subject: [PATCH 60/76] Update the clang_github_tracker --- clang_github_tracker/apps.py | 7 + .../commands/backfill_clang_github_tracker.py | 235 +++++++++ .../commands/run_clang_github_tracker.py | 447 +++++++----------- .../migrations/0001_initial.py | 78 +++ clang_github_tracker/migrations/__init__.py | 0 clang_github_tracker/models.py | 40 ++ .../preprocessors/issue_preprocessor.py | 75 ++- .../preprocessors/pr_preprocessor.py | 75 ++- clang_github_tracker/services.py | 74 +++ clang_github_tracker/state_manager.py | 310 ++---------- clang_github_tracker/sync_raw.py | 111 ++--- clang_github_tracker/tests/test_backfill.py | 79 ++++ clang_github_tracker/tests/test_commands.py | 99 ++-- .../tests/test_preprocessors.py | 84 ++++ clang_github_tracker/tests/test_services.py | 42 ++ .../tests/test_state_manager.py | 87 +++- clang_github_tracker/workspace.py | 22 +- config/test_settings.py | 1 + docs/Pinecone_preprocess_guideline.md | 12 + docs/Schema.md | 15 + docs/Workspace.md | 4 +- docs/service_api/README.md | 2 + docs/service_api/clang_github_tracker.md | 35 ++ 23 files changed, 1228 insertions(+), 706 deletions(-) create mode 100644 clang_github_tracker/apps.py create mode 100644 clang_github_tracker/management/commands/backfill_clang_github_tracker.py create mode 100644 
clang_github_tracker/migrations/0001_initial.py create mode 100644 clang_github_tracker/migrations/__init__.py create mode 100644 clang_github_tracker/models.py create mode 100644 clang_github_tracker/services.py create mode 100644 clang_github_tracker/tests/test_backfill.py create mode 100644 clang_github_tracker/tests/test_preprocessors.py create mode 100644 clang_github_tracker/tests/test_services.py create mode 100644 docs/service_api/clang_github_tracker.md diff --git a/clang_github_tracker/apps.py b/clang_github_tracker/apps.py new file mode 100644 index 0000000..aec5d50 --- /dev/null +++ b/clang_github_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class ClangGithubTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "clang_github_tracker" + verbose_name = "Clang GitHub Tracker" diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py new file mode 100644 index 0000000..e5380e6 --- /dev/null +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -0,0 +1,235 @@ +""" +Backfill ClangGithubIssueItem / ClangGithubCommit from CSV or raw JSON scan. 
+""" + +from __future__ import annotations + +import csv +import json +import logging +import re +from pathlib import Path + +from django.core.management.base import BaseCommand, CommandError + +from clang_github_tracker import services as clang_services +from clang_github_tracker import state_manager as clang_state +from clang_github_tracker.workspace import ( + OWNER, + REPO, + default_backfill_csv_path, + get_raw_repo_dir, +) +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, + parse_datetime, +) + +logger = logging.getLogger(__name__) + +_SHA40 = re.compile(r"^[0-9a-fA-F]{40}$") + + +def _commit_date_from_json(data: dict): + commit = data.get("commit") or {} + author = commit.get("author") or commit.get("committer") or {} + date_str = author.get("date") + if not date_str: + return None + return parse_datetime(date_str) or clang_state.parse_iso(date_str) + + +class Command(BaseCommand): + help = ( + "Backfill clang_github_tracker DB from CSV (--from-csv) or raw JSON dirs (--from-raw). " + "CSV columns: record_type (issue|pr|commit), number, github_created_at, github_updated_at, " + "sha, github_committed_at." + ) + + def add_arguments(self, parser): + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--from-csv", + nargs="?", + const="", + default=None, + metavar="PATH", + help=( + "Import from CSV. 
If PATH is omitted, use workspace/clang_github_tracker/" + "clang_github_tracker_backfill.csv" + ), + ) + group.add_argument( + "--from-raw", + action="store_true", + help="Scan raw/github_activity_tracker///commits|issues|prs/*.json", + ) + + def handle(self, *args, **options): + if options.get("from_raw"): + self._backfill_from_raw() + return + csv_arg = options.get("from_csv") + path = Path(csv_arg) if csv_arg else default_backfill_csv_path() + self._backfill_from_csv(path) + + def _backfill_from_csv(self, path: Path) -> None: + if not path.is_file(): + raise CommandError(f"CSV not found: {path}") + inserted = updated = skipped = 0 + with path.open(encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if not reader.fieldnames: + raise CommandError("CSV has no header row") + for row in reader: + rt = (row.get("record_type") or "").strip().lower() + try: + if rt == "issue": + num = int((row.get("number") or "").strip()) + gc = parse_datetime((row.get("github_created_at") or "").strip()) + gu = parse_datetime((row.get("github_updated_at") or "").strip()) + _, was_created = clang_services.upsert_issue_item( + num, + is_pull_request=False, + github_created_at=gc, + github_updated_at=gu, + ) + inserted += bool(was_created) + updated += not was_created + elif rt == "pr": + num = int((row.get("number") or "").strip()) + gc = parse_datetime((row.get("github_created_at") or "").strip()) + gu = parse_datetime((row.get("github_updated_at") or "").strip()) + _, was_created = clang_services.upsert_issue_item( + num, + is_pull_request=True, + github_created_at=gc, + github_updated_at=gu, + ) + inserted += bool(was_created) + updated += not was_created + elif rt == "commit": + sha = (row.get("sha") or "").strip() + if not _SHA40.match(sha): + logger.warning("skip commit row: invalid sha %r", sha) + skipped += 1 + continue + gcm = parse_datetime( + (row.get("github_committed_at") or "").strip() + ) + _, was_created = clang_services.upsert_commit( + sha, 
github_committed_at=gcm + ) + inserted += bool(was_created) + updated += not was_created + else: + logger.warning("skip row: unknown record_type %r", rt) + skipped += 1 + except (TypeError, ValueError) as e: + logger.warning("skip row: %s (row=%r)", e, row) + skipped += 1 + logger.info( + "CSV backfill done: inserted=%s updated=%s skipped=%s path=%s", + inserted, + updated, + skipped, + path, + ) + + def _backfill_from_raw(self) -> None: + root = get_raw_repo_dir(OWNER, REPO, create=False) + if not root.is_dir(): + raise CommandError(f"Raw repo dir missing: {root}") + + commits_dir = root / "commits" + if commits_dir.is_dir(): + c_ins = c_upd = c_skip = 0 + for p in commits_dir.glob("*.json"): + try: + data = json.loads(p.read_text(encoding="utf-8")) + sha = (data.get("sha") or "").strip() + if not _SHA40.match(sha): + c_skip += 1 + continue + dt = _commit_date_from_json(data) + _, was_created = clang_services.upsert_commit( + sha, github_committed_at=dt + ) + if was_created: + c_ins += 1 + else: + c_upd += 1 + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip commit file %s: %s", p, e) + c_skip += 1 + logger.info( + "raw commits/: inserted=%s updated=%s skipped=%s", + c_ins, + c_upd, + c_skip, + ) + + issues_dir = root / "issues" + if issues_dir.is_dir(): + i_ins = i_upd = i_skip = 0 + for p in issues_dir.glob("*.json"): + try: + data = json.loads(p.read_text(encoding="utf-8")) + flat = normalize_issue_json(data) + num = flat.get("number") + if not isinstance(num, int) or num <= 0: + i_skip += 1 + continue + _, was_created = clang_services.upsert_issue_item( + num, + is_pull_request=False, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + if was_created: + i_ins += 1 + else: + i_upd += 1 + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip issue file %s: %s", p, e) + i_skip += 1 + logger.info( + "raw issues/: 
inserted=%s updated=%s skipped=%s", + i_ins, + i_upd, + i_skip, + ) + + prs_dir = root / "prs" + if prs_dir.is_dir(): + p_ins = p_upd = p_skip = 0 + for p in prs_dir.glob("*.json"): + try: + data = json.loads(p.read_text(encoding="utf-8")) + flat = normalize_pr_json(data) + num = flat.get("number") + if not isinstance(num, int) or num <= 0: + p_skip += 1 + continue + _, was_created = clang_services.upsert_issue_item( + num, + is_pull_request=True, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + if was_created: + p_ins += 1 + else: + p_upd += 1 + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("skip pr file %s: %s", p, e) + p_skip += 1 + logger.info( + "raw prs/: inserted=%s updated=%s skipped=%s", + p_ins, + p_upd, + p_skip, + ) + + logger.info("raw backfill finished root=%s", root) diff --git a/clang_github_tracker/management/commands/run_clang_github_tracker.py b/clang_github_tracker/management/commands/run_clang_github_tracker.py index b612fa3..64af311 100644 --- a/clang_github_tracker/management/commands/run_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/run_clang_github_tracker.py @@ -1,28 +1,22 @@ """ Management command: run_clang_github_tracker -Fetches GitHub activity for llvm/llvm-project and saves only to -raw/github_activity_tracker/llvm/llvm-project (no DB writes). - -State (last commit/issue/PR dates) is stored in workspace/clang_github_activity/state.json. -If state is missing, it is created by scanning existing raw files or with nulls then scraping. - -After sync, updated issues/PRs are exported as Markdown and pushed to the private repo -configured via CLANG_GITHUB_TRACKER_PRIVATE_REPO_* settings. +Fetches GitHub activity for llvm/llvm-project, saves raw JSON and DB rows, optionally +exports Markdown and pushes to the private repo. Resume uses DB watermarks (not state.json). 
""" import logging -import os -from datetime import datetime, timezone from pathlib import Path from django.conf import settings from django.core.management import call_command from django.core.management.base import BaseCommand, CommandError +from core.utils.datetime_parsing import parse_iso_datetime from clang_github_tracker import state_manager as clang_state from clang_github_tracker.sync_raw import sync_raw_only from clang_github_tracker.workspace import OWNER, REPO, get_workspace_root + from github_ops import get_github_token, upload_folder_to_github from operations.md_ops.github_export import ( detect_renames_from_dirs, @@ -32,7 +26,6 @@ logger = logging.getLogger(__name__) DEFAULT_PRIVATE_MD_BRANCH = "master" -PINECONE_NAMESPACE_ENV_KEY = "CLANG_GITHUB_PINECONE_NAMESPACE" def _run_pinecone_sync( @@ -40,12 +33,13 @@ def _run_pinecone_sync( ) -> None: """Trigger run_cppa_pinecone_sync if app_type and namespace are both set.""" if not app_type: - logger.warning("Pinecone sync skipped: --pinecone-app-type is empty.") + logger.warning( + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_APP_TYPE is empty (settings/env)." + ) return if not namespace: logger.warning( - "Pinecone sync skipped: namespace is empty (set --pinecone-namespace or %s).", - PINECONE_NAMESPACE_ENV_KEY, + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_NAMESPACE is empty (settings/env)." ) return try: @@ -68,282 +62,182 @@ def _run_pinecone_sync( class Command(BaseCommand): - """Django management command: fetch GitHub activity to raw and optionally run sync.""" + """Django management command: fetch GitHub activity to raw + DB; optional MD, push, Pinecone.""" help = ( "Run Clang GitHub Tracker: fetch llvm/llvm-project activity to " - "raw/github_activity_tracker only (no DB). Uses workspace/clang_github_activity/state.json for resume." + "raw/github_activity_tracker and DB. Uses DB cursor for resume (not state.json). " + "Use --skip-* to skip steps; default runs all." 
) def add_arguments(self, parser): - """Register --dry-run, --from-date, --to-date, --no-upload, --pinecone-app-type, --pinecone-namespace.""" parser.add_argument( "--dry-run", action="store_true", - help="Only show resolved start/end dates and state; do not fetch.", + help="No sync, export, push, or Pinecone writes; resolved windows logged at INFO.", ) parser.add_argument( - "--from-date", - type=str, - default=None, - help="Start date for sync (ISO format: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS). Default: from state or raw scan.", + "--skip-github-sync", + action="store_true", + help="Skip API fetch / sync_raw_only.", ) parser.add_argument( - "--to-date", - type=str, - default=None, - help="End date for sync (ISO format). Default: now.", + "--skip-markdown-export", + action="store_true", + help="Skip writing .md files from this run's sync results.", ) parser.add_argument( - "--no-upload", + "--skip-remote-push", action="store_true", - help="Generate Markdown files but skip pushing to GitHub (useful for inspection).", + help="Skip push to CLANG_GITHUB_TRACKER_PRIVATE_REPO_*.", ) parser.add_argument( - "--upload-only", + "--skip-pinecone", action="store_true", - help="Only upload existing MD files from workspace (no sync, no MD generation).", + help="Skip run_cppa_pinecone_sync for issues and PRs.", ) parser.add_argument( - "--pinecone-app-type", + "--since", + "--from-date", + "--start-time", type=str, - default=settings.CLANG_GITHUB_PINECONE_APP_TYPE, - help="App type passed to run_cppa_pinecone_sync. Default from env CLANG_GITHUB_PINECONE_APP_TYPE.", + default=None, + dest="since", + help="Sync window start: YYYY-MM-DD or ISO-8601. " + "--from-date / --start-time are aliases for --since.", ) parser.add_argument( - "--pinecone-namespace", + "--until", + "--to-date", + "--end-time", type=str, - default=settings.CLANG_GITHUB_PINECONE_NAMESPACE, - help=f"Pinecone namespace for sync. 
Default from env {PINECONE_NAMESPACE_ENV_KEY}.", + default=None, + dest="until", + help="Sync window end: same formats as --since. " + "--to-date / --end-time are aliases for --until.", ) def handle(self, *args, **options): - """Resolve dates from state or CLI, then run sync unless --dry-run or --upload-only.""" dry_run = options["dry_run"] - no_upload = options.get("no_upload", False) - upload_only = options.get("upload_only", False) - from_date_str = (options.get("from_date") or "").strip() - to_date_str = (options.get("to_date") or "").strip() - pinecone_app_type = ( - options.get("pinecone_app_type") or "" - ).strip() or settings.CLANG_GITHUB_PINECONE_APP_TYPE - pinecone_namespace = ( - options.get("pinecone_namespace") or "" - ).strip() or settings.CLANG_GITHUB_PINECONE_NAMESPACE - - if upload_only: - self._upload_md_only(dry_run=dry_run) - return - - from_date = None - to_date = None - if from_date_str: - try: - from_date = datetime.fromisoformat(from_date_str) - except ValueError as e: - logger.warning("Invalid --from-date: %s", e) - if to_date_str: - try: - to_date = datetime.fromisoformat(to_date_str) - except ValueError as e: - logger.warning("Invalid --to-date: %s", e) + skip_github_sync = options["skip_github_sync"] + skip_markdown_export = options["skip_markdown_export"] + skip_remote_push = options["skip_remote_push"] + skip_pinecone = options["skip_pinecone"] - # Normalize to UTC for comparison - if from_date and from_date.tzinfo is None: - from_date = from_date.replace(tzinfo=timezone.utc) - elif from_date: - from_date = from_date.astimezone(timezone.utc) - if to_date and to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) - elif to_date: - to_date = to_date.astimezone(timezone.utc) - - if from_date and to_date and from_date > to_date: - raise CommandError( - "Invalid date range: from_date must be before or equal to to_date." 
- ) - - resolved = clang_state.resolve_start_end_dates(from_date, to_date) - if resolved is None: - return + try: + since = parse_iso_datetime(options.get("since")) + until = parse_iso_datetime(options.get("until")) + except ValueError as e: + raise CommandError(str(e)) from e - start_commit, start_issue, start_pr, end_date = resolved + start_commit, start_item, end_date = clang_state.resolve_start_end_dates( + since, until + ) logger.info( - "Resolved: start_commit=%r start_issue=%r start_pr=%r end=%r", + "Resolved: start_commit=%r start_item=%r end=%r", start_commit, - start_issue, - start_pr, + start_item, end_date, ) + if dry_run: - logger.info("Dry run: no fetch performed.") + if not skip_github_sync: + logger.info("dry-run: would run GitHub sync for llvm/llvm-project") + else: + logger.info("dry-run: skipping GitHub sync (--skip-github-sync)") + if not skip_markdown_export: + logger.info("dry-run: would export Markdown for issues/PRs from sync") + if not skip_remote_push: + logger.info("dry-run: would push Markdown to private repo") + if not skip_pinecone: + logger.info("dry-run: would run Pinecone upsert for issues and PRs") + logger.info("dry-run finished") return - try: - commits_saved, issue_numbers, pr_numbers = sync_raw_only( - start_commit=start_commit, - start_issue=start_issue, - start_pr=start_pr, - end_date=end_date, - ) - logger.info( - "run_clang_github_tracker: saved commits=%s issues=%s prs=%s", - commits_saved, - len(issue_numbers), - len(pr_numbers), - ) - except Exception as e: - logger.exception("run_clang_github_tracker failed: %s", e) - raise + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] - if not issue_numbers and not pr_numbers: - logger.info( - "run_clang_github_tracker: no issues/PRs synced; skipping MD export." 
- ) - return + if not skip_github_sync: + try: + commits_saved, issue_numbers, pr_numbers = sync_raw_only( + start_commit=start_commit, + start_item=start_item, + end_date=end_date, + ) + logger.info( + "run_clang_github_tracker: sync done; commits=%s issues=%s prs=%s", + commits_saved, + len(issue_numbers), + len(pr_numbers), + ) + except Exception as e: + logger.exception("run_clang_github_tracker sync failed: %s", e) + raise + else: + logger.info("skipping GitHub sync (--skip-github-sync)") md_output_dir = get_workspace_root() / "md_export" md_output_dir.mkdir(parents=True, exist_ok=True) - self.stdout.write(f"Writing MD to {md_output_dir}") - - try: - new_files = write_md_files( - owner=OWNER, - repo=REPO, - issue_numbers=issue_numbers, - pr_numbers=pr_numbers, - output_dir=md_output_dir, - folder_prefix="", - ) - logger.info( - "run_clang_github_tracker: generated %s MD file(s).", - len(new_files), - ) - if not new_files: - logger.info( - "run_clang_github_tracker: no MD files generated; skipping upload." + new_files: dict[str, str] = {} + if not skip_markdown_export: + if issue_numbers or pr_numbers: + logger.info("writing MD to %s", md_output_dir) + new_files = write_md_files( + owner=OWNER, + repo=REPO, + issue_numbers=issue_numbers, + pr_numbers=pr_numbers, + output_dir=md_output_dir, + folder_prefix="", ) - return - - if no_upload: logger.info( - "run_clang_github_tracker: --no-upload set; skipping GitHub push." 
+ "run_clang_github_tracker: generated %s MD file(s).", + len(new_files), ) - return - - private_owner = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" - ).strip() - private_repo_name = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", "" - ).strip() - private_branch = ( - getattr( - settings, - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", - DEFAULT_PRIVATE_MD_BRANCH, - ) - or DEFAULT_PRIVATE_MD_BRANCH - ).strip() - if not private_owner or not private_repo_name: - logger.error( - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER / CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME " - "not configured; skipping upload." - ) - return - - token = get_github_token(use="write") - delete_paths = detect_renames_from_dirs( - private_owner, - private_repo_name, - private_branch, - new_files, - token=token, - ) - for repo_rel in delete_paths: - stale_local = md_output_dir / repo_rel - if stale_local.exists(): - stale_local.unlink() - if delete_paths: + elif skip_github_sync: + logger.info("skipped Markdown export (no sync in this run)") + else: logger.info( - "run_clang_github_tracker: %s renamed file(s) to delete.", - len(delete_paths), + "run_clang_github_tracker: no issues/PRs synced; skipping MD export." 
) + else: + logger.info("skipping Markdown export (--skip-markdown-export)") - result = upload_folder_to_github( - local_folder=md_output_dir, - owner=private_owner, - repo=private_repo_name, - commit_message="chore: update Clang issues/PRs markdown", - branch=private_branch, - delete_paths=delete_paths or None, - ) - - if result.get("success"): - logger.info("run_clang_github_tracker: MD upload complete.") - for local_path in new_files.values(): - Path(local_path).unlink(missing_ok=True) + if not skip_remote_push: + if not new_files: + if skip_markdown_export and not skip_github_sync: + logger.warning( + "nothing new to push (--skip-markdown-export); skipping remote push" + ) + elif skip_github_sync: + logger.warning("nothing to push from this run (sync was skipped)") + elif not issue_numbers and not pr_numbers: + logger.info("no MD files to push (no issues/PRs in sync)") else: - msg = result.get("message") or "Upload failed" - logger.error("run_clang_github_tracker: MD upload failed: %s", msg) - raise CommandError(msg) - except Exception as e: - logger.exception("run_clang_github_tracker: MD export/upload failed: %s", e) - raise - - # Phase: upsert issues and PRs to Pinecone - effective_app_type = ( - pinecone_app_type or settings.CLANG_GITHUB_PINECONE_APP_TYPE - ) - effective_namespace = ( - pinecone_namespace or settings.CLANG_GITHUB_PINECONE_NAMESPACE - ) - _run_pinecone_sync( - f"{effective_app_type}-issues", - effective_namespace, - "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", - ) - _run_pinecone_sync( - f"{effective_app_type}-prs", - effective_namespace, - "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", - ) - - def _upload_md_only(self, *, dry_run: bool = False): - """Upload existing MD files from workspace/clang_github_activity/md_export (no sync, no generation).""" - if dry_run: - logger.info( - "run_clang_github_tracker: --upload-only with --dry-run; skipping upload." 
+ self._push_markdown(md_output_dir, new_files) + else: + logger.info("skipping remote push (--skip-remote-push)") + + if not skip_pinecone: + app_type = (settings.CLANG_GITHUB_PINECONE_APP_TYPE or "").strip() + namespace = (settings.CLANG_GITHUB_PINECONE_NAMESPACE or "").strip() + _run_pinecone_sync( + f"{app_type}-issues", + namespace, + "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", ) - return - md_output_dir = get_workspace_root() / "md_export" - if not md_output_dir.is_dir(): - self.stdout.write( - self.style.WARNING( - f"No md_export folder at {md_output_dir}; nothing to upload." - ) + _run_pinecone_sync( + f"{app_type}-prs", + namespace, + "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", ) - return + else: + logger.info("skipping Pinecone (--skip-pinecone)") - new_files = {} - for root, _dirs, files in os.walk(md_output_dir): - for name in files: - if not name.endswith(".md"): - continue - path = Path(root) / name - repo_rel = path.relative_to(md_output_dir).as_posix() - new_files[repo_rel] = str(path) - - if not new_files: - self.stdout.write( - self.style.WARNING("No .md files in md_export; nothing to upload.") - ) - return - - self.stdout.write(f"Writing MD to {md_output_dir}") - self.stdout.write(f"Found {len(new_files)} .md file(s) to upload.") + logger.info("run_clang_github_tracker finished successfully") + def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None: private_owner = getattr( settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" ).strip() @@ -358,60 +252,45 @@ def _upload_md_only(self, *, dry_run: bool = False): ) or DEFAULT_PRIVATE_MD_BRANCH ).strip() - if not private_owner or not private_repo_name: logger.error( "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER / CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME " - "not configured." - ) - self.stdout.write( - self.style.ERROR( - "Private repo not configured; set CLANG_GITHUB_TRACKER_PRIVATE_REPO_*." 
- ) + "not configured; skipping upload." ) return - try: - token = get_github_token(use="write") - delete_paths = detect_renames_from_dirs( - private_owner, - private_repo_name, - private_branch, - new_files, - token=token, + token = get_github_token(use="write") + delete_paths = detect_renames_from_dirs( + private_owner, + private_repo_name, + private_branch, + new_files, + token=token, + ) + for repo_rel in delete_paths: + stale_local = md_output_dir / repo_rel + if stale_local.exists(): + stale_local.unlink() + if delete_paths: + logger.info( + "run_clang_github_tracker: %s renamed file(s) to delete.", + len(delete_paths), ) - for repo_rel in delete_paths: - stale_local = md_output_dir / repo_rel - if stale_local.exists(): - stale_local.unlink() - if delete_paths: - logger.info( - "run_clang_github_tracker: %s renamed file(s) to delete.", - len(delete_paths), - ) - result = upload_folder_to_github( - local_folder=md_output_dir, - owner=private_owner, - repo=private_repo_name, - commit_message="chore: update Clang issues/PRs markdown", - branch=private_branch, - delete_paths=delete_paths or None, - ) + result = upload_folder_to_github( + local_folder=md_output_dir, + owner=private_owner, + repo=private_repo_name, + commit_message="chore: update Clang issues/PRs markdown", + branch=private_branch, + delete_paths=delete_paths or None, + ) - if result.get("success"): - self.stdout.write(self.style.SUCCESS("MD upload complete.")) - logger.info("run_clang_github_tracker: MD upload complete.") - for local_path in new_files.values(): - Path(local_path).unlink(missing_ok=True) - else: - msg = result.get("message") or "Upload failed" - self.stdout.write(self.style.ERROR(f"Upload failed: {msg}")) - logger.error( - "run_clang_github_tracker: MD upload failed: %s", - msg, - ) - raise CommandError(msg) - except Exception as e: - logger.exception("run_clang_github_tracker: upload-only failed: %s", e) - raise + if result.get("success"): + logger.info("run_clang_github_tracker: 
MD upload complete.") + for local_path in new_files.values(): + Path(local_path).unlink(missing_ok=True) + else: + msg = result.get("message") or "Upload failed" + logger.error("run_clang_github_tracker: MD upload failed: %s", msg) + raise CommandError(msg) diff --git a/clang_github_tracker/migrations/0001_initial.py b/clang_github_tracker/migrations/0001_initial.py new file mode 100644 index 0000000..75b3863 --- /dev/null +++ b/clang_github_tracker/migrations/0001_initial.py @@ -0,0 +1,78 @@ +# Generated manually for clang_github_tracker models + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="ClangGithubCommit", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("sha", models.CharField(max_length=40, unique=True)), + ( + "github_committed_at", + models.DateTimeField(blank=True, db_index=True, null=True), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "db_table": "clang_github_tracker_commit", + }, + ), + migrations.CreateModel( + name="ClangGithubIssueItem", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("number", models.PositiveIntegerField(unique=True)), + ("is_pull_request", models.BooleanField(default=False)), + ( + "github_created_at", + models.DateTimeField(blank=True, null=True), + ), + ( + "github_updated_at", + models.DateTimeField( + blank=True, + db_index=True, + help_text="GitHub API updated_at; drives fetch watermarks.", + null=True, + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + db_index=True, + help_text="Last DB save; drives Pinecone incrementality vs 
final_sync_at.", + ), + ), + ], + options={ + "db_table": "clang_github_tracker_issue_item", + }, + ), + ] diff --git a/clang_github_tracker/migrations/__init__.py b/clang_github_tracker/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/clang_github_tracker/models.py b/clang_github_tracker/models.py new file mode 100644 index 0000000..bcbb3cb --- /dev/null +++ b/clang_github_tracker/models.py @@ -0,0 +1,40 @@ +"""Database models for clang_github_tracker (no FKs to other apps).""" + +from __future__ import annotations + +from django.db import models + + +class ClangGithubIssueItem(models.Model): + """One row per GitHub issue or PR number for the configured llvm repo.""" + + number = models.PositiveIntegerField(unique=True) + is_pull_request = models.BooleanField(default=False) + github_created_at = models.DateTimeField(null=True, blank=True) + github_updated_at = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text="GitHub API updated_at; drives fetch watermarks.", + ) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField( + auto_now=True, + db_index=True, + help_text="Last DB save; drives Pinecone incrementality vs final_sync_at.", + ) + + class Meta: + db_table = "clang_github_tracker_issue_item" + + +class ClangGithubCommit(models.Model): + """One row per commit SHA synced for the configured llvm repo.""" + + sha = models.CharField(max_length=40, unique=True) + github_committed_at = models.DateTimeField(null=True, blank=True, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + db_table = "clang_github_tracker_commit" diff --git a/clang_github_tracker/preprocessors/issue_preprocessor.py b/clang_github_tracker/preprocessors/issue_preprocessor.py index 966b26a..aaedf21 100644 --- a/clang_github_tracker/preprocessors/issue_preprocessor.py +++ 
b/clang_github_tracker/preprocessors/issue_preprocessor.py @@ -1,24 +1,29 @@ """ Pinecone issue preprocessor for clang_github_tracker. -Wraps github_activity_tracker.preprocessors.github_preprocess.preprocess_issues -for the llvm/llvm-project repo (configured via CLANG_GITHUB_OWNER / CLANG_GITHUB_REPO). - -Usage (via run_cppa_pinecone_sync or run_clang_github_tracker): - app_type = APP_TYPE (default: "github-clang", override with CLANG_GITHUB_PINECONE_APP_TYPE env) - namespace = NAMESPACE ("github-clang") - preprocessor = clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone +Selects candidate issue numbers from DB (updated_at vs final_sync_at) plus failed_ids retries, +then builds documents from raw JSON via github_preprocess.build_issue_document. """ from __future__ import annotations +import json +import logging import os +import re from datetime import datetime from typing import Any from django.conf import settings +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from github_activity_tracker.preprocessors.github_preprocess import build_issue_document +from github_activity_tracker.workspace import get_raw_source_issue_path -from github_activity_tracker.preprocessors.github_preprocess import preprocess_issues +logger = logging.getLogger(__name__) + +_ISSUE_ID_SUFFIX = re.compile(r":issue:(\d+)$") NAMESPACE = "github-clang" APP_TYPE = os.getenv("CLANG_GITHUB_PINECONE_APP_TYPE", NAMESPACE) @@ -28,18 +33,42 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, ) -> tuple[list[dict[str, Any]], bool]: - """Preprocess clang GitHub issues for Pinecone upsert. - - Args: - failed_ids: Previously failed ids strings to retry. - final_sync_at: Last successful sync timestamp; None means first run. 
- - Returns: - (documents, is_chunked=False) - """ - return preprocess_issues( - settings.CLANG_GITHUB_OWNER, - settings.CLANG_GITHUB_REPO, - failed_ids, - final_sync_at, - ) + """Preprocess clang GitHub issues for Pinecone upsert.""" + owner = settings.CLANG_GITHUB_OWNER + repo = settings.CLANG_GITHUB_REPO + + if final_sync_at is None: + qs = ClangGithubIssueItem.objects.filter(is_pull_request=False).values_list( + "number", flat=True + ) + else: + fs = final_sync_at + if timezone.is_naive(fs): + fs = timezone.make_aware(fs, timezone.utc) + qs = ClangGithubIssueItem.objects.filter( + is_pull_request=False, updated_at__gt=fs + ).values_list("number", flat=True) + + numbers: set[int] = set(int(n) for n in qs) + + for fid in failed_ids: + m = _ISSUE_ID_SUFFIX.search(fid or "") + if m: + numbers.add(int(m.group(1))) + + documents: list[dict[str, Any]] = [] + for number in sorted(numbers): + path = get_raw_source_issue_path(owner, repo, number) + if not path.is_file(): + logger.debug("preprocess issue #%s: raw missing %s", number, path) + continue + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("preprocess issue #%s: read failed %s", number, e) + continue + doc = build_issue_document(path, data, repo) + if doc: + documents.append(doc) + + return documents, False diff --git a/clang_github_tracker/preprocessors/pr_preprocessor.py b/clang_github_tracker/preprocessors/pr_preprocessor.py index bd9f0c9..06f6aab 100644 --- a/clang_github_tracker/preprocessors/pr_preprocessor.py +++ b/clang_github_tracker/preprocessors/pr_preprocessor.py @@ -1,24 +1,29 @@ """ Pinecone PR preprocessor for clang_github_tracker. -Wraps github_activity_tracker.preprocessors.github_preprocess.preprocess_prs -for the llvm/llvm-project repo (configured via CLANG_GITHUB_OWNER / CLANG_GITHUB_REPO). 
- -Usage (via run_cppa_pinecone_sync or run_clang_github_tracker): - app_type = APP_TYPE (default: "github-clang", override with CLANG_GITHUB_PINECONE_APP_TYPE env) - namespace = NAMESPACE ("github-clang") - preprocessor = clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone +Selects candidate PR numbers from DB (updated_at vs final_sync_at) plus failed_ids retries, +then builds documents from raw JSON via github_preprocess.build_pr_document. """ from __future__ import annotations +import json +import logging import os +import re from datetime import datetime from typing import Any from django.conf import settings +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from github_activity_tracker.preprocessors.github_preprocess import build_pr_document +from github_activity_tracker.workspace import get_raw_source_pr_path -from github_activity_tracker.preprocessors.github_preprocess import preprocess_prs +logger = logging.getLogger(__name__) + +_PR_ID_SUFFIX = re.compile(r":pr:(\d+)$") NAMESPACE = "github-clang" APP_TYPE = os.getenv("CLANG_GITHUB_PINECONE_APP_TYPE", NAMESPACE) @@ -28,18 +33,42 @@ def preprocess_for_pinecone( failed_ids: list[str], final_sync_at: datetime | None, ) -> tuple[list[dict[str, Any]], bool]: - """Preprocess clang GitHub pull requests for Pinecone upsert. - - Args: - failed_ids: Previously failed ids strings to retry. - final_sync_at: Last successful sync timestamp; None means first run. 
- - Returns: - (documents, is_chunked=False) - """ - return preprocess_prs( - settings.CLANG_GITHUB_OWNER, - settings.CLANG_GITHUB_REPO, - failed_ids, - final_sync_at, - ) + """Preprocess clang GitHub pull requests for Pinecone upsert.""" + owner = settings.CLANG_GITHUB_OWNER + repo = settings.CLANG_GITHUB_REPO + + if final_sync_at is None: + qs = ClangGithubIssueItem.objects.filter(is_pull_request=True).values_list( + "number", flat=True + ) + else: + fs = final_sync_at + if timezone.is_naive(fs): + fs = timezone.make_aware(fs, timezone.utc) + qs = ClangGithubIssueItem.objects.filter( + is_pull_request=True, updated_at__gt=fs + ).values_list("number", flat=True) + + numbers: set[int] = set(int(n) for n in qs) + + for fid in failed_ids: + m = _PR_ID_SUFFIX.search(fid or "") + if m: + numbers.add(int(m.group(1))) + + documents: list[dict[str, Any]] = [] + for number in sorted(numbers): + path = get_raw_source_pr_path(owner, repo, number) + if not path.is_file(): + logger.debug("preprocess pr #%s: raw missing %s", number, path) + continue + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("preprocess pr #%s: read failed %s", number, e) + continue + doc = build_pr_document(path, data, repo) + if doc: + documents.append(doc) + + return documents, False diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py new file mode 100644 index 0000000..2d43d88 --- /dev/null +++ b/clang_github_tracker/services.py @@ -0,0 +1,74 @@ +"""DB upsert and watermark helpers for clang_github_tracker.""" + +from __future__ import annotations + +import logging +from datetime import datetime, timedelta +from typing import Optional + +from django.db.models import Max + +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem + +logger = logging.getLogger(__name__) + + +def upsert_issue_item( + number: int, + *, + is_pull_request: bool, + 
github_created_at: datetime | None, + github_updated_at: datetime | None, +) -> tuple[ClangGithubIssueItem, bool]: + """Create or update a ClangGithubIssueItem by ``number``. Returns (instance, created).""" + obj, created = ClangGithubIssueItem.objects.update_or_create( + number=number, + defaults={ + "is_pull_request": is_pull_request, + "github_created_at": github_created_at, + "github_updated_at": github_updated_at, + }, + ) + logger.debug( + "clang issue item #%s %s (pr=%s)", + number, + "created" if created else "updated", + is_pull_request, + ) + return obj, created + + +def upsert_commit( + sha: str, + *, + github_committed_at: datetime | None, +) -> tuple[ClangGithubCommit, bool]: + """Create or update a ClangGithubCommit by ``sha``. Returns (instance, created).""" + sha_clean = (sha or "").strip() + if len(sha_clean) != 40: + raise ValueError(f"commit sha must be 40 hex chars, got {sha_clean!r}") + obj, created = ClangGithubCommit.objects.update_or_create( + sha=sha_clean, + defaults={"github_committed_at": github_committed_at}, + ) + logger.debug("clang commit %s %s", sha_clean[:8], "created" if created else "updated") + return obj, created + + +def get_issue_item_watermark() -> Optional[datetime]: + """Max ``github_updated_at`` across issues and PRs (API fetch cursor base).""" + m = ClangGithubIssueItem.objects.aggregate(m=Max("github_updated_at"))["m"] + return m + + +def get_commit_watermark() -> Optional[datetime]: + """Max ``github_committed_at`` across commits (API fetch cursor base).""" + m = ClangGithubCommit.objects.aggregate(m=Max("github_committed_at"))["m"] + return m + + +def start_after_watermark(max_dt: datetime | None) -> datetime | None: + """Return ``max + 1s`` for API fetch lower bound, or ``None`` if no watermark.""" + if max_dt is None: + return None + return max_dt + timedelta(seconds=1) diff --git a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index f84675d..436cc64 100644 --- 
a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -1,25 +1,23 @@ """ -State for clang_github_tracker: last sync dates per entity (commits, issues, PRs). +Date resolution for clang_github_tracker sync windows. -Stored in workspace/clang_github_activity/state.json. -When state file is missing, it can be computed by scanning raw/github_activity_tracker/llvm/llvm-project. +Uses DB watermarks on ClangGithubIssueItem / ClangGithubCommit (not state.json). """ from __future__ import annotations -import json import logging -from datetime import datetime, timezone, timedelta -from pathlib import Path +from datetime import datetime -from clang_github_tracker.workspace import get_state_path, get_raw_repo_dir +from django.utils import timezone -logger = logging.getLogger(__name__) +from clang_github_tracker.services import ( + get_commit_watermark, + get_issue_item_watermark, + start_after_watermark, +) -# Keys in state JSON -KEY_LAST_COMMIT_DATE = "last_commit_date" -KEY_LAST_ISSUE_DATE = "last_issue_date" -KEY_LAST_PR_DATE = "last_pr_date" +logger = logging.getLogger(__name__) def parse_iso(s: str | None) -> datetime | None: @@ -32,263 +30,59 @@ def parse_iso(s: str | None) -> datetime | None: return None -def _to_iso(dt: datetime | None) -> str | None: - """Return datetime as ISO string with Z suffix, or None.""" +def _aware_utc(dt: datetime | None) -> datetime | None: if dt is None: return None - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return dt.isoformat().replace("+00:00", "Z") - - -def load_state() -> dict[str, str | None]: - """ - Load state from workspace/clang_github_activity/state.json. + if timezone.is_naive(dt): + return timezone.make_aware(dt, timezone.utc) + return dt.astimezone(timezone.utc) - Returns: - - {} (empty dict): file missing, invalid, read error, or loaded object is empty/all-None - → invalid; ensure_state_file_exists will recompute from raw. 
- - Dict with keys last_commit_date, last_issue_date, last_pr_date (values str or None): - valid state file (at least one non-None date); None means no previous sync for that entity. - """ - path = get_state_path() - if not path.exists(): - return {} - try: - data = json.loads(path.read_text(encoding="utf-8")) - if not isinstance(data, dict): - return {} - # Treat empty or all-None as invalid so ensure_state_file_exists can recompute from raw - if not any( - ( - data.get(KEY_LAST_COMMIT_DATE), - data.get(KEY_LAST_ISSUE_DATE), - data.get(KEY_LAST_PR_DATE), - ) - ): - return {} - return { - KEY_LAST_COMMIT_DATE: data.get(KEY_LAST_COMMIT_DATE), - KEY_LAST_ISSUE_DATE: data.get(KEY_LAST_ISSUE_DATE), - KEY_LAST_PR_DATE: data.get(KEY_LAST_PR_DATE), - } - except Exception as e: - logger.warning("Failed to load state from %s: %s", path, e) - return {} - -def save_state( - last_commit_date: datetime | None = None, - last_issue_date: datetime | None = None, - last_pr_date: datetime | None = None, - *, - merge: bool = True, -) -> None: - """Write state to workspace/clang_github_activity/state.json. 
If merge=True, load existing and update only provided keys.""" - path = get_state_path() - path.parent.mkdir(parents=True, exist_ok=True) - if merge: - current = load_state() - if last_commit_date is not None: - current[KEY_LAST_COMMIT_DATE] = _to_iso(last_commit_date) - if last_issue_date is not None: - current[KEY_LAST_ISSUE_DATE] = _to_iso(last_issue_date) - if last_pr_date is not None: - current[KEY_LAST_PR_DATE] = _to_iso(last_pr_date) - data = current - else: - data = { - KEY_LAST_COMMIT_DATE: _to_iso(last_commit_date), - KEY_LAST_ISSUE_DATE: _to_iso(last_issue_date), - KEY_LAST_PR_DATE: _to_iso(last_pr_date), - } - path.write_text(json.dumps(data, indent=2), encoding="utf-8") - logger.debug("Saved state to %s", path) - - -def _latest_date_from_commit_json(path: Path) -> datetime | None: - """Read a commit JSON file and return the author/committer date, or None.""" - try: - data = json.loads(path.read_text(encoding="utf-8")) - commit = data.get("commit") or {} - author = commit.get("author") or commit.get("committer") or {} - date_str = author.get("date") - return parse_iso(date_str) - except Exception: - return None - - -def _latest_date_from_issue_or_pr_json(path: Path) -> datetime | None: - """Read an issue or PR JSON file and return updated_at or created_at, or None.""" - try: - data = json.loads(path.read_text(encoding="utf-8")) - # Top-level or nested under issue_info / pr_info - for obj in [data, data.get("issue_info"), data.get("pr_info")]: - if not isinstance(obj, dict): - continue - date_str = obj.get("updated_at") or obj.get("created_at") - dt = parse_iso(date_str) - if dt is not None: - return dt - return None - except Exception: - return None - - -def compute_state_from_raw() -> dict[str, str | None]: - """ - Scan raw/github_activity_tracker// for commits, issues, prs - and return state dict with last_commit_date, last_issue_date, last_pr_date (ISO or None). 
- If the raw folder does not exist, returns all Nones (caller can write state.json from this). - """ - root = get_raw_repo_dir(create=False) - result: dict[str, str | None] = { - KEY_LAST_COMMIT_DATE: None, - KEY_LAST_ISSUE_DATE: None, - KEY_LAST_PR_DATE: None, - } - if not root.is_dir(): - return result - - # Commits - commits_dir = root / "commits" - if commits_dir.is_dir(): - latest_commit: datetime | None = None - for p in commits_dir.glob("*.json"): - dt = _latest_date_from_commit_json(p) - if dt and (latest_commit is None or dt > latest_commit): - latest_commit = dt - result[KEY_LAST_COMMIT_DATE] = _to_iso(latest_commit) - - # Issues - issues_dir = root / "issues" - if issues_dir.is_dir(): - latest_issue: datetime | None = None - for p in issues_dir.glob("*.json"): - dt = _latest_date_from_issue_or_pr_json(p) - if dt and (latest_issue is None or dt > latest_issue): - latest_issue = dt - result[KEY_LAST_ISSUE_DATE] = _to_iso(latest_issue) - - # PRs - prs_dir = root / "prs" - if prs_dir.is_dir(): - latest_pr: datetime | None = None - for p in prs_dir.glob("*.json"): - dt = _latest_date_from_issue_or_pr_json(p) - if dt and (latest_pr is None or dt > latest_pr): - latest_pr = dt - result[KEY_LAST_PR_DATE] = _to_iso(latest_pr) - - return result - - -def ensure_state_file_exists() -> dict[str, str | None]: - """ - If state file does not exist, ensure state.json exists: - - If raw folder exists: compute state from raw and write state.json. - - If raw folder does not exist: write state.json with {last_commit_date: null, last_issue_date: null, last_pr_date: null}. - If state file exists, load and return. If the file content is not a valid object (empty or invalid JSON), retry once by recomputing from raw and overwriting. - - Returns: - - {} (empty dict): error (e.g. write failed after retry). Caller should log and finish. - - Dict with last_commit_date, last_issue_date, last_pr_date (str or None): state exists; use it (None = fetch from beginning). 
- """ - path = get_state_path() - if path.exists(): - state = load_state() - if state: - return state - # File exists but content is not a valid object (empty or invalid); retry once by recomputing from raw - logger.warning( - "state.json is empty or not a valid object; recomputing from raw once." - ) - computed = compute_state_from_raw() - try: - path.write_text(json.dumps(computed, indent=2), encoding="utf-8") - logger.info( - "Rewrote state file from raw: last_commit=%s last_issue=%s last_pr=%s", - computed.get(KEY_LAST_COMMIT_DATE), - computed.get(KEY_LAST_ISSUE_DATE), - computed.get(KEY_LAST_PR_DATE), - ) - return computed - except OSError as e: - logger.warning("Failed to rewrite state file %s: %s", path, e) - return {} - computed = compute_state_from_raw() - try: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(computed, indent=2), encoding="utf-8") - except OSError as e: - logger.warning( - "Failed to write state file %s: %s; proceeding with no state.", - path, - e, - ) - return {} - logger.info( - "Created state file from raw scan: last_commit=%s last_issue=%s last_pr=%s", - computed.get(KEY_LAST_COMMIT_DATE), - computed.get(KEY_LAST_ISSUE_DATE), - computed.get(KEY_LAST_PR_DATE), - ) - return computed +def _apply_since_floor(cursor_start: datetime | None, since: datetime | None) -> datetime | None: + """Lower bound: max(DB cursor, ``since``) when ``since`` is set; else DB cursor.""" + if since is None: + return cursor_start + s = _aware_utc(since) + if cursor_start is None: + return s + c = _aware_utc(cursor_start) + assert c is not None and s is not None + return max(c, s) def resolve_start_end_dates( - from_date: datetime | None, - to_date: datetime | None, -) -> tuple[datetime | None, datetime | None, datetime | None, datetime] | None: + since: datetime | None, + until: datetime | None, +) -> tuple[datetime | None, datetime | None, datetime]: """ - Resolve start dates for commits, issues, PRs and end_date. 
+ Resolve ``start_commit``, unified ``start_item`` (issues+PRs), and ``end_date``. - - If from_date and to_date are both provided (CLI): use them for all three and for end. - - Else: ensure state file exists (create from raw scan if missing), then use state's - last_*_date + 1s as start per entity, and to_date or now as end. - - Returns: - (start_commit, start_issue, start_pr, end_date) when state is valid. - None when state is {} after one retry — error is logged; caller should finish. + - If both ``since`` and ``until`` are set and ``since <= until``: use ``since`` for + both starts and ``until`` as end. + - If ``since > until``: log warning and ignore both bounds (fall back to DB + now). + - Otherwise: starts from DB max + 1s (or None if empty/null max), with optional + ``since`` as a per-stream floor. End is ``until`` or ``timezone.now()``. """ - if from_date is not None and to_date is not None: - # CLI provided both: use for all - if from_date.tzinfo is None: - from_date = from_date.replace(tzinfo=timezone.utc) - if to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) - return from_date, from_date, from_date, to_date - - state = ensure_state_file_exists() - if not state: - logger.error( - "State unavailable (error reading state.json or raw folder). Cannot resolve dates; exiting." 
- ) - return None - now = datetime.now(timezone.utc) + since_aware = _aware_utc(since) + until_aware = _aware_utc(until) + + if since_aware is not None and until_aware is not None: + if since_aware > until_aware: + logger.warning( + "invalid date range: since (%s) is after until (%s); using DB cursors and default end", + since_aware, + until_aware, + ) + since_aware, until_aware = None, None + else: + return since_aware, since_aware, until_aware - if to_date is None: - to_date = now - elif to_date.tzinfo is None: - to_date = to_date.replace(tzinfo=timezone.utc) + end_date = until_aware if until_aware is not None else timezone.now() - def start_from_state(key: str) -> datetime | None: - s = state.get(key) - dt = parse_iso(s) if isinstance(s, str) else None - if dt is None: - return None - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return dt + timedelta(seconds=1) + item_wm = start_after_watermark(get_issue_item_watermark()) + commit_wm = start_after_watermark(get_commit_watermark()) - start_commit = ( - from_date if from_date is not None else start_from_state(KEY_LAST_COMMIT_DATE) - ) - start_issue = ( - from_date if from_date is not None else start_from_state(KEY_LAST_ISSUE_DATE) - ) - start_pr = ( - from_date if from_date is not None else start_from_state(KEY_LAST_PR_DATE) - ) + start_item = _apply_since_floor(item_wm, since_aware) + start_commit = _apply_since_floor(commit_wm, since_aware) - return start_commit, start_issue, start_pr, to_date + return start_commit, start_item, end_date diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 2ce62f9..90157b7 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -1,7 +1,7 @@ """ -Sync llvm/llvm-project to raw/github_activity_tracker only (no DB). +Sync llvm/llvm-project to raw/github_activity_tracker and clang_github_tracker DB. -Uses github_activity_tracker.fetcher and raw_source; does not call services or persist to DB. 
+Uses github_activity_tracker.fetcher and raw_source; persists issue/PR/commit rows via services. """ from __future__ import annotations @@ -16,10 +16,16 @@ save_issue_raw_source, save_pr_raw_source, ) +from github_activity_tracker.sync.utils import ( + normalize_issue_json, + normalize_pr_json, + parse_datetime, +) from github_ops import get_github_client from github_ops.client import ConnectionException, RateLimitException from clang_github_tracker import state_manager as clang_state +from clang_github_tracker import services as clang_services from clang_github_tracker.workspace import OWNER, REPO logger = logging.getLogger(__name__) @@ -41,57 +47,25 @@ def _commit_date(commit_data: dict) -> datetime | None: date_str = author.get("date") if not date_str: return None - return clang_state.parse_iso(date_str) - - -def _issue_date(issue_data: dict) -> datetime | None: - """Extract updated_at or created_at from GitHub issue payload. - Fetcher yields {issue_info: , comments: [...]}, so check nested first. - """ - info = issue_data.get("issue_info") or issue_data - date_str = info.get("updated_at") or info.get("created_at") - if not date_str: - return None - return clang_state.parse_iso(date_str) - - -def _pr_date(pr_data: dict) -> datetime | None: - """Extract updated_at or created_at from GitHub PR payload. - Fetcher yields {pr_info: , comments: [...], reviews: [...]}, so check nested first. 
- """ - info = pr_data.get("pr_info") or pr_data - date_str = info.get("updated_at") or info.get("created_at") - if not date_str: - return None - return clang_state.parse_iso(date_str) + return parse_datetime(date_str) or clang_state.parse_iso(date_str) def sync_raw_only( start_commit: datetime | None = None, - start_issue: datetime | None = None, - start_pr: datetime | None = None, + start_item: datetime | None = None, end_date: Optional[datetime] = None, ) -> tuple[int, list[int], list[int]]: """ - Fetch llvm/llvm-project commits, issues, PRs from GitHub and save only to - raw/github_activity_tracker/llvm/llvm-project. No DB writes. + Fetch llvm/llvm-project commits, issues, PRs from GitHub and save to raw paths + and upsert ``ClangGithubCommit`` / ``ClangGithubIssueItem``. Args: start_commit: Start date for commits (None = from beginning). - start_issue: Issue watermark for the unified issues+PRs fetch (one ``/issues`` - list with both item kinds). ``None`` only means “no issue cursor” when - deriving the shared start: if ``start_pr`` is also ``None``, the unified - fetch runs from the beginning; if ``start_pr`` is set, that timestamp is - used as the single lower bound for the whole list (issues are filtered - by the same window). When both ``start_issue`` and ``start_pr`` are set, - the shared lower bound is the **later** of the two (``max``), so one - GitHub query covers both types from that time forward. - start_pr: PR watermark; same shared-bound semantics as ``start_issue``. + start_item: Single lower bound for unified issues+PRs ``/issues`` fetch. end_date: End date for all (default: now). Returns: - (commits_saved, issue_numbers, pr_numbers) — commit count and lists of - issue/PR numbers saved during this run. + (commits_saved, issue_numbers, pr_numbers). 
""" from django.utils import timezone as django_tz @@ -101,27 +75,15 @@ def sync_raw_only( end_date = django_tz.now() end_date = _ensure_utc(end_date) start_commit = _ensure_utc(start_commit) - start_issue = _ensure_utc(start_issue) - start_pr = _ensure_utc(start_pr) + start_item = _ensure_utc(start_item) client = get_github_client(use="scraping") commits_saved = 0 issue_numbers: list[int] = [] pr_numbers: list[int] = [] - latest_commit: datetime | None = None - latest_issue: datetime | None = None - latest_pr: datetime | None = None - - # Single lower bound for the unified /issues fetch: later of the two when both - # watermarks exist; otherwise whichever side is initialized (or None if both). - if start_issue and start_pr: - start_item = max(start_issue, start_pr) - else: - start_item = start_issue or start_pr try: - # Commits for commit_data in fetcher.fetch_commits_from_github( client, owner, repo, start_commit, end_date ): @@ -129,13 +91,15 @@ def sync_raw_only( if sha: save_commit_raw_source(owner, repo, commit_data) commits_saved += 1 - dt = _commit_date(commit_data) - if dt and (latest_commit is None or dt > latest_commit): - latest_commit = dt - if latest_commit is not None: - clang_state.save_state(last_commit_date=latest_commit, merge=True) + committed_at = _commit_date(commit_data) + try: + clang_services.upsert_commit( + str(sha).strip(), + github_committed_at=committed_at, + ) + except ValueError as e: + logger.warning("skip commit DB upsert: %s", e) - # Issues and PRs — fetched together via a single /issues list call. 
for item in fetcher.fetch_issues_and_prs_from_github( client, owner, repo, start_item, end_date ): @@ -144,9 +108,15 @@ def sync_raw_only( if pr_number is not None: save_pr_raw_source(owner, repo, item) pr_numbers.append(pr_number) - dt = _pr_date(item) - if dt and (latest_pr is None or dt > latest_pr): - latest_pr = dt + flat = normalize_pr_json(item) + num = flat.get("number") + if isinstance(num, int) and num > 0: + clang_services.upsert_issue_item( + num, + is_pull_request=True, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) else: issue_number = (item.get("issue_info") or {}).get("number") or item.get( "number" @@ -154,14 +124,15 @@ def sync_raw_only( if issue_number is not None: save_issue_raw_source(owner, repo, item) issue_numbers.append(issue_number) - dt = _issue_date(item) - if dt and (latest_issue is None or dt > latest_issue): - latest_issue = dt - - if latest_issue is not None: - clang_state.save_state(last_issue_date=latest_issue, merge=True) - if latest_pr is not None: - clang_state.save_state(last_pr_date=latest_pr, merge=True) + flat = normalize_issue_json(item) + num = flat.get("number") + if isinstance(num, int) and num > 0: + clang_services.upsert_issue_item( + num, + is_pull_request=False, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) except (ConnectionException, RateLimitException) as e: logger.exception("clang_github_tracker sync failed: %s", e) diff --git a/clang_github_tracker/tests/test_backfill.py b/clang_github_tracker/tests/test_backfill.py new file mode 100644 index 0000000..74f25a4 --- /dev/null +++ b/clang_github_tracker/tests/test_backfill.py @@ -0,0 +1,79 @@ +"""Tests for backfill_clang_github_tracker.""" + +import json +from pathlib import Path + +import pytest +from django.core.management import call_command + +from clang_github_tracker.models import ClangGithubCommit, 
ClangGithubIssueItem +from clang_github_tracker.workspace import OWNER, REPO + + +@pytest.mark.django_db +def test_backfill_csv(tmp_path): + csv_path = tmp_path / "b.csv" + csv_path.write_text( + "record_type,number,github_created_at,github_updated_at,sha,github_committed_at\n" + "issue,1,2024-01-01T00:00:00Z,2024-01-02T00:00:00Z,,\n" + "pr,2,,,,\n" + f"commit,,,,{'a' * 40},2024-03-01T00:00:00Z\n", + encoding="utf-8", + ) + call_command("backfill_clang_github_tracker", f"--from-csv={csv_path}") + assert ClangGithubIssueItem.objects.filter(number=1, is_pull_request=False).exists() + assert ClangGithubIssueItem.objects.filter(number=2, is_pull_request=True).exists() + assert ClangGithubCommit.objects.filter(sha="a" * 40).exists() + + +@pytest.mark.django_db +def test_backfill_from_raw(tmp_path, monkeypatch): + root = tmp_path / "raw" / OWNER / REPO + (root / "issues").mkdir(parents=True) + (root / "prs").mkdir(parents=True) + (root / "commits").mkdir(parents=True) + (root / "issues" / "3.json").write_text( + json.dumps( + { + "issue_info": { + "number": 3, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + } + } + ), + encoding="utf-8", + ) + (root / "prs" / "4.json").write_text( + json.dumps( + { + "pr_info": { + "number": 4, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + } + } + ), + encoding="utf-8", + ) + sha = "b" * 40 + (root / "commits" / f"{sha}.json").write_text( + json.dumps( + { + "sha": sha, + "commit": { + "author": {"date": "2024-05-01T00:00:00Z"}, + }, + } + ), + encoding="utf-8", + ) + + monkeypatch.setattr( + "clang_github_tracker.workspace.get_raw_repo_dir", + lambda *a, **k: root, + ) + call_command("backfill_clang_github_tracker", "--from-raw") + assert ClangGithubIssueItem.objects.filter(number=3, is_pull_request=False).exists() + assert ClangGithubIssueItem.objects.filter(number=4, is_pull_request=True).exists() + assert ClangGithubCommit.objects.filter(sha=sha).exists() diff 
--git a/clang_github_tracker/tests/test_commands.py b/clang_github_tracker/tests/test_commands.py index e6ab17c..95a0ff6 100644 --- a/clang_github_tracker/tests/test_commands.py +++ b/clang_github_tracker/tests/test_commands.py @@ -1,6 +1,5 @@ -"""Tests for clang_github_tracker management command (run_clang_github_tracker).""" +"""Tests for clang_github_tracker management commands.""" -import json import logging import pytest @@ -9,32 +8,39 @@ from django.core.management import call_command -from config.workspace import get_workspace_path - CMD_NAME = "run_clang_github_tracker" @pytest.mark.django_db -def test_run_clang_github_tracker_dry_run_creates_state_if_missing(caplog): - """With --dry-run and no state file, command creates state from raw scan and resolves dates.""" - workspace = get_workspace_path("clang_github_activity") - state_file = workspace / "state.json" - if state_file.exists(): - state_file.unlink() +def test_run_clang_github_tracker_dry_run_logs_resolved(caplog): + """Dry run resolves dates from DB and does not call sync.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only" + ) as sync_mock: + with caplog.at_level(logging.INFO): + call_command(CMD_NAME, "--dry-run", stdout=StringIO(), stderr=StringIO()) + sync_mock.assert_not_called() + assert any("Resolved:" in r.getMessage() for r in caplog.records) + assert any("dry-run" in r.getMessage().lower() for r in caplog.records) + + +@pytest.mark.django_db +def test_run_clang_github_tracker_dry_run_skip_sync(caplog): + """Dry run with --skip-github-sync still logs resolved window.""" with caplog.at_level(logging.INFO): - call_command(CMD_NAME, "--dry-run", stdout=StringIO(), stderr=StringIO()) - assert state_file.exists(), "State file should be created by command" - state = json.loads(state_file.read_text(encoding="utf-8")) - assert "last_commit_date" in state - assert "last_issue_date" in state - assert "last_pr_date" in state + call_command( + CMD_NAME, 
+ "--dry-run", + "--skip-github-sync", + stdout=StringIO(), + stderr=StringIO(), + ) assert any("Resolved:" in r.getMessage() for r in caplog.records) - assert any("Dry run" in r.getMessage() for r in caplog.records) @pytest.mark.django_db -def test_run_clang_github_tracker_dry_run_with_dates(caplog): - """With --from-date and --to-date and --dry-run, command does not call sync.""" +def test_run_clang_github_tracker_since_until_aliases(caplog): + """--from-date/--to-date aliases parse like Boost.""" with patch( "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only" ) as sync_mock: @@ -47,27 +53,60 @@ def test_run_clang_github_tracker_dry_run_with_dates(caplog): stdout=StringIO(), stderr=StringIO(), ) - sync_mock.assert_not_called() - assert any("Resolved:" in r.getMessage() for r in caplog.records) + sync_mock.assert_not_called() + assert any("Resolved:" in r.getMessage() for r in caplog.records) @pytest.mark.django_db def test_run_clang_github_tracker_calls_sync_raw_only_when_not_dry_run(caplog): - """Without --dry-run, command calls sync_raw_only with resolved dates.""" + """Without --dry-run, command calls sync_raw_only with start_item.""" with patch( "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only", - return_value=(0, [], []), # commits_saved, issue_numbers, pr_numbers (lists) + return_value=(0, [], []), ) as sync_mock: with caplog.at_level(logging.INFO): call_command( CMD_NAME, - "--from-date=2024-01-01", - "--to-date=2024-01-02", + "--since=2024-01-01", + "--until=2024-01-02", stdout=StringIO(), stderr=StringIO(), ) - sync_mock.assert_called_once() - call_kw = sync_mock.call_args[1] - assert "start_commit" in call_kw - assert "end_date" in call_kw - assert any("saved commits=" in r.getMessage() for r in caplog.records) + sync_mock.assert_called_once() + call_kw = sync_mock.call_args[1] + assert "start_commit" in call_kw + assert "start_item" in call_kw + assert "end_date" in call_kw + assert 
"start_issue" not in call_kw + assert any("commits=" in r.getMessage() for r in caplog.records) + + +@pytest.mark.django_db +def test_run_clang_github_tracker_skip_pinecone(caplog): + """--skip-pinecone does not call run_cppa_pinecone_sync.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only", + return_value=(0, [1], []), + ): + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.call_command" + ) as cc: + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + call_command( + CMD_NAME, + "--since=2024-01-01", + "--until=2024-01-02", + "--skip-pinecone", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + pinecone_calls = [ + c + for c in cc.call_args_list + if c[0] and c[0][0] == "run_cppa_pinecone_sync" + ] + assert not pinecone_calls diff --git a/clang_github_tracker/tests/test_preprocessors.py b/clang_github_tracker/tests/test_preprocessors.py new file mode 100644 index 0000000..1e517b6 --- /dev/null +++ b/clang_github_tracker/tests/test_preprocessors.py @@ -0,0 +1,84 @@ +"""Tests for DB-driven clang preprocessors.""" + +from datetime import timedelta +from unittest.mock import patch + +import pytest +from django.utils import timezone + +from clang_github_tracker.models import ClangGithubIssueItem +from clang_github_tracker.preprocessors import issue_preprocessor, pr_preprocessor + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.issue_preprocessor.build_issue_document") +def test_issue_preprocessor_db_and_failed_ids(mock_build, tmp_path, settings): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = {"content": "body", "metadata": {"doc_id": "u", "ids": "x"}} + + p10 = tmp_path / "10.json" + p10.write_text("{}", encoding="utf-8") + + ClangGithubIssueItem.objects.create( + number=10, + is_pull_request=False, + 
github_updated_at=timezone.now(), + ) + final = timezone.now() - timedelta(hours=1) + + def _issue_path(_owner, _repo, n): + return p10 if n == 10 else tmp_path / f"missing_{n}.json" + + with patch( + "clang_github_tracker.preprocessors.issue_preprocessor.get_raw_source_issue_path", + side_effect=_issue_path, + ): + docs, chunked = issue_preprocessor.preprocess_for_pinecone( + ["llvm-project:issue:99"], final + ) + assert chunked is False + assert mock_build.call_count == 1 + assert len(docs) == 1 + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.issue_preprocessor.build_issue_document") +def test_issue_preprocessor_all_rows_when_final_sync_none(mock_build, tmp_path, settings): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = None + p5 = tmp_path / "5.json" + p5.write_text("{}", encoding="utf-8") + ClangGithubIssueItem.objects.create( + number=5, + is_pull_request=False, + github_updated_at=timezone.now(), + ) + with patch( + "clang_github_tracker.preprocessors.issue_preprocessor.get_raw_source_issue_path", + return_value=p5, + ): + docs, _ = issue_preprocessor.preprocess_for_pinecone([], None) + assert mock_build.call_count == 1 + assert docs == [] + + +@pytest.mark.django_db +@patch("clang_github_tracker.preprocessors.pr_preprocessor.build_pr_document") +def test_pr_preprocessor_failed_id_parsing(mock_build, tmp_path, settings): + settings.CLANG_GITHUB_OWNER = "llvm" + settings.CLANG_GITHUB_REPO = "llvm-project" + mock_build.return_value = {"content": "p", "metadata": {"doc_id": "u", "ids": "y"}} + p7 = tmp_path / "7.json" + p7.write_text("{}", encoding="utf-8") + with patch( + "clang_github_tracker.preprocessors.pr_preprocessor.get_raw_source_pr_path", + return_value=p7, + ): + docs, chunked = pr_preprocessor.preprocess_for_pinecone( + ["llvm-project:pr:7"], timezone.now() + ) + assert chunked is False + assert mock_build.call_count == 1 + assert len(docs) == 1 diff --git 
a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py new file mode 100644 index 0000000..a72d7f2 --- /dev/null +++ b/clang_github_tracker/tests/test_services.py @@ -0,0 +1,42 @@ +"""Tests for clang_github_tracker.services.""" + +from datetime import timedelta + +import pytest +from django.utils import timezone + +from clang_github_tracker import services as clang_services +from clang_github_tracker.models import ClangGithubIssueItem + + +@pytest.mark.django_db +def test_upsert_issue_item_create_and_update_bumps_updated_at(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + _, created = clang_services.upsert_issue_item( + 42, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + assert created is True + row = ClangGithubIssueItem.objects.get(number=42) + first_updated = row.updated_at + + _, created2 = clang_services.upsert_issue_item( + 42, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t1, + ) + assert created2 is False + row.refresh_from_db() + assert row.updated_at > first_updated + assert row.github_updated_at == t1 + + +@pytest.mark.django_db +def test_watermarks_empty(): + assert clang_services.get_issue_item_watermark() is None + assert clang_services.get_commit_watermark() is None + assert clang_services.start_after_watermark(None) is None diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 6e3c36e..fdf9428 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -1,9 +1,12 @@ -"""Tests for clang_github_tracker.state (no DB).""" +"""Tests for clang_github_tracker.state_manager (DB-backed date resolution).""" -from unittest.mock import patch +from datetime import timedelta +import pytest +from django.utils import timezone from clang_github_tracker import state_manager as clang_state +from 
clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem def test_parse_iso_valid(): @@ -24,11 +27,75 @@ def test_parse_iso_invalid_or_empty(): assert clang_state.parse_iso("not-a-date") is None -def test_compute_state_from_raw_empty_dir(tmp_path): - """When raw repo dir does not exist, compute_state_from_raw returns nulls.""" - with patch("clang_github_tracker.state_manager.get_raw_repo_dir") as m: - m.return_value = tmp_path / "nonexistent_repo_dir" - result = clang_state.compute_state_from_raw() - assert result[clang_state.KEY_LAST_COMMIT_DATE] is None - assert result[clang_state.KEY_LAST_ISSUE_DATE] is None - assert result[clang_state.KEY_LAST_PR_DATE] is None +@pytest.mark.django_db +def test_resolve_empty_db_no_since_until(): + """Empty tables → None starts; end is now (approximately).""" + ClangGithubIssueItem.objects.all().delete() + ClangGithubCommit.objects.all().delete() + sc, si, end = clang_state.resolve_start_end_dates(None, None) + assert sc is None and si is None + assert end is not None + now = timezone.now() + assert abs((end - now).total_seconds()) < 5 + + +@pytest.mark.django_db +def test_resolve_db_watermark_plus_one_second(): + """Max github fields drive start = max + 1s.""" + base = timezone.now() - timedelta(days=1) + ClangGithubIssueItem.objects.create( + number=1, + is_pull_request=False, + github_created_at=base, + github_updated_at=base, + ) + ClangGithubCommit.objects.create( + sha="a" * 40, + github_committed_at=base, + ) + sc, si, end = clang_state.resolve_start_end_dates(None, None) + assert sc == base + timedelta(seconds=1) + assert si == base + timedelta(seconds=1) + + +@pytest.mark.django_db +def test_resolve_both_since_until_closed_window(): + """Both bounds valid → same since for commit and item; until as end.""" + since = timezone.now() - timedelta(days=10) + until = timezone.now() - timedelta(days=5) + sc, si, end = clang_state.resolve_start_end_dates(since, until) + assert sc == since + assert si == since + 
assert end == until + + +@pytest.mark.django_db +def test_resolve_invalid_range_clears_bounds(caplog): + """since > until → warning and DB-based resolution.""" + ClangGithubIssueItem.objects.create( + number=99, + is_pull_request=False, + github_updated_at=timezone.now() - timedelta(hours=1), + ) + since = timezone.now() + until = timezone.now() - timedelta(days=1) + with caplog.at_level("WARNING"): + sc, si, end = clang_state.resolve_start_end_dates(since, until) + assert any("invalid date range" in r.getMessage() for r in caplog.records) + assert end is not None + assert sc is not None and si is not None + + +@pytest.mark.django_db +def test_resolve_since_floor_without_until(): + """Only since: starts are max(DB+1s, since).""" + base = timezone.now() - timedelta(days=30) + ClangGithubIssueItem.objects.create( + number=2, + is_pull_request=False, + github_updated_at=base, + ) + since = timezone.now() - timedelta(days=1) + sc, si, _end = clang_state.resolve_start_end_dates(since, None) + assert si is not None + assert si >= since diff --git a/clang_github_tracker/workspace.py b/clang_github_tracker/workspace.py index 349d73e..dc12814 100644 --- a/clang_github_tracker/workspace.py +++ b/clang_github_tracker/workspace.py @@ -1,9 +1,11 @@ """ -Workspace paths for clang_github_tracker: state file and raw GitHub activity dir. +Workspace paths for clang_github_tracker: md export, backfill CSV dir, raw GitHub JSON. 
Layout: workspace/clang_github_activity/ - - state.json + - md_export/ (generated Markdown for private repo push) + workspace/clang_github_tracker/ + - clang_github_tracker_backfill.csv (default CSV backfill path) workspace/raw/github_activity_tracker/// - commits/, issues/, prs/ """ @@ -16,9 +18,10 @@ from config.workspace import get_workspace_path _APP_SLUG = "clang_github_activity" +_TRACKER_DATA_SLUG = "clang_github_tracker" _RAW_APP_SLUG = "github_activity_tracker" -STATE_FILENAME = "state.json" +DEFAULT_BACKFILL_CSV_NAME = "clang_github_tracker_backfill.csv" # Repo we sync (raw only, no DB); from settings (env: CLANG_GITHUB_OWNER, CLANG_GITHUB_REPO) OWNER = settings.CLANG_GITHUB_OWNER @@ -41,9 +44,16 @@ def get_workspace_root() -> Path: return get_workspace_path(_APP_SLUG) -def get_state_path() -> Path: - """Return workspace/clang_github_activity/state.json. Parent dir created on first write.""" - return get_workspace_root() / STATE_FILENAME +def get_clang_github_tracker_data_dir() -> Path: + """Return workspace/clang_github_tracker/; creates dir if missing.""" + path = get_workspace_path(_TRACKER_DATA_SLUG) + path.mkdir(parents=True, exist_ok=True) + return path + + +def default_backfill_csv_path() -> Path: + """Default path for CSV backfill: workspace/clang_github_tracker/.""" + return get_clang_github_tracker_data_dir() / DEFAULT_BACKFILL_CSV_NAME def get_raw_root() -> Path: diff --git a/config/test_settings.py b/config/test_settings.py index b724fd9..7461925 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -51,6 +51,7 @@ "github_activity_tracker", "boost_library_tracker", "clang_github_activity", + "clang_github_tracker", "discord_activity_tracker", "shared", ): diff --git a/docs/Pinecone_preprocess_guideline.md b/docs/Pinecone_preprocess_guideline.md index c04a2f8..ad43ec4 100644 --- a/docs/Pinecone_preprocess_guideline.md +++ b/docs/Pinecone_preprocess_guideline.md @@ -154,6 +154,18 @@ If no `instance` is specified, **public** is 
used. --- +## Clang GitHub Tracker (`clang_github_tracker`) + +For **llvm/llvm-project** issues and PRs, `clang_github_tracker.preprocessors.issue_preprocessor` and `pr_preprocessor` **do not** scan all raw JSON files. They: + +1. Select candidate **numbers** from the DB: `ClangGithubIssueItem` rows where `updated_at > final_sync_at` (or **all** rows if `final_sync_at` is `None`), filtered by `is_pull_request`. +2. Union **retry** numbers parsed from `failed_ids` strings (e.g. `…:issue:123`, `…:pr:456`). +3. For each number, read the corresponding raw file under `workspace/raw/github_activity_tracker/...` and build the document with `github_activity_tracker.preprocessors.github_preprocess.build_issue_document` / `build_pr_document`. + +The **`cppa_pinecone_sync`** contract (`preprocess_fn(failed_ids, final_sync_at)`, fail list, sync status) is unchanged; only the clang preprocessors’ **selection** strategy differs from the Boost path. + +--- + ## Summary checklist - [ ] Signature: `(failed_ids: list[str], final_sync_at: datetime | None) -> tuple[list[dict], bool]`. diff --git a/docs/Schema.md b/docs/Schema.md index 98d1034..77c91bd 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -355,6 +355,19 @@ erDiagram --- +### 2b. Clang GitHub Tracker (`clang_github_tracker`) + +Standalone tables for the **llvm/llvm-project** (or `CLANG_GITHUB_OWNER` / `CLANG_GITHUB_REPO`) mirror. **No foreign keys** to other apps. + +| Model | Purpose | +| ----- | ------- | +| **ClangGithubIssueItem** | One row per issue or PR **number** (`unique`). `is_pull_request` distinguishes types. `github_created_at` / `github_updated_at` mirror GitHub API times; **`github_updated_at`** (with `Max` + 1s) drives **API fetch** resume. Django **`updated_at`** (`auto_now`) bumps on every upsert and drives **Pinecone** incrementality vs `PineconeSyncStatus.final_sync_at`. | +| **ClangGithubCommit** | One row per **sha** (`unique`, 40-char hex). 
`github_committed_at` is the author/committer date used for commit fetch watermarks. | + +Raw JSON remains under `workspace/raw/github_activity_tracker/<owner>/<repo>/` (same layout as other raw GitHub activity). + +--- + ### 3. Boost Library Tracker #### Part 1: Boost Library, Headers, and Dependencies @@ -859,6 +872,8 @@ erDiagram | **PullRequestComment** | Comment on a PR. | 2 | | **PullRequestAssignee** | PR-assignee link. | 2 | | **PullRequestLabel** | PR-label name. | 2 | +| **ClangGithubIssueItem** | Clang mirror: one row per issue/PR number (no FKs); GitHub timestamps + Django `updated_at` for Pinecone incrementality. | 2b | +| **ClangGithubCommit** | Clang mirror: one row per commit SHA (no FKs); `github_committed_at` for fetch watermark. | 2b | | **BoostLibraryRepository** | Extends GitHubRepository; adds created_at, updated_at (Boost repos). | 3 | | **BoostLibrary** | Library within a Boost repo (name). | 3 | | **BoostFile** | Extends GitHubFile; adds library_id (file in a Boost library). | 3 | diff --git a/docs/Workspace.md b/docs/Workspace.md index 76a030e..711fe5d 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -20,8 +20,8 @@ workspace/ # WORKSPACE_DIR (configurable via │ │ └── prs/.json │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── /.json -├── clang_github_activity/ # State for clang_github_tracker (last sync dates) -│ └── state.json +├── clang_github_activity/ # Markdown export for clang_github_tracker (md_export/) +├── clang_github_tracker/ # Optional CSV backfill (default: clang_github_tracker_backfill.csv) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── / │ └── messages/.json # Formatted cache (processed then removed) diff --git a/docs/service_api/README.md b/docs/service_api/README.md index 2d47cbb..4a72683 100644 --- a/docs/service_api/README.md +++ b/docs/service_api/README.md @@ -14,6 +14,7 @@ Index of all app service modules.
All writes to app models must go through the s | [boost_usage_tracker.services](boost_usage_tracker.md) | boost_usage_tracker | External repos, Boost usage, missing-header tmp. | | [discord_activity_tracker.services](discord_activity_tracker.md) | discord_activity_tracker | Servers, channels, messages, reactions (user profiles in cppa_user_tracker). | | [cppa_youtube_script_tracker.services](cppa_youtube_script_tracker.md) | cppa_youtube_script_tracker | YouTube channels, videos, transcript state, and speaker links for C++ conference talks. | +| [clang_github_tracker.services](clang_github_tracker.md) | clang_github_tracker | Upsert llvm issue/PR/commit rows; DB watermarks for API fetch windows. | --- @@ -27,5 +28,6 @@ Index of all app service modules. All writes to app models must go through the s - **discord_activity_tracker** – Get-or-create DiscordServer, DiscordChannel; create/update DiscordMessage, DiscordReaction. Discord user profiles in cppa_user_tracker. - **cppa_youtube_script_tracker** – Get-or-create YouTubeChannel, YouTubeVideo; update transcript state; link speakers to videos. Speaker profiles (`YoutubeSpeaker`) in cppa_user_tracker. - **cppa_pinecone_sync** – Get/clear/record failed IDs in PineconeFailList; get/update PineconeSyncStatus. +- **clang_github_tracker** – Upsert `ClangGithubIssueItem` / `ClangGithubCommit` during sync or backfill; read `Max(github_updated_at)` / `Max(github_committed_at)` for fetch cursors. See [Contributing.md](../Contributing.md) for the rule that all writes go through the service layer. diff --git a/docs/service_api/clang_github_tracker.md b/docs/service_api/clang_github_tracker.md new file mode 100644 index 0000000..8d6a7ee --- /dev/null +++ b/docs/service_api/clang_github_tracker.md @@ -0,0 +1,35 @@ +# clang_github_tracker.services + +**Module path:** `clang_github_tracker.services` +**Description:** Upserts for `ClangGithubIssueItem` and `ClangGithubCommit` (no FKs to other apps). 
Used by `sync_raw_only`, `backfill_clang_github_tracker`, and date resolution watermarks. + +**Type notation:** Models live in `clang_github_tracker.models`. + +--- + +## Upserts + +| Function | Parameters | Return | Raises | +| -------- | ---------- | ------ | ------ | +| `upsert_issue_item` | `number: int`, `*, is_pull_request: bool`, `github_created_at`, `github_updated_at` | `tuple[ClangGithubIssueItem, bool]` (instance, created) | — | +| `upsert_commit` | `sha: str`, `*, github_committed_at` | `tuple[ClangGithubCommit, bool]` | `ValueError` if `sha` is not 40 hex chars | + +--- + +## API fetch watermarks + +| Function | Return | Notes | +| -------- | ------ | ----- | +| `get_issue_item_watermark` | `datetime \| None` | `Max(github_updated_at)` over all issue/PR rows (unified issues+PR stream). | +| `get_commit_watermark` | `datetime \| None` | `Max(github_committed_at)` over commits. | +| `start_after_watermark` | `datetime \| None` | `max_dt + timedelta(seconds=1)` or `None` if `max_dt` is `None`. | + +Used by `clang_github_tracker.state_manager.resolve_start_end_dates` (with optional CLI `--since` / `--until` bounds). + +--- + +## Related docs + +- [Schema.md](../Schema.md) – Section 2b: Clang GitHub Tracker. +- [Workspace.md](../Workspace.md) – `workspace/raw/github_activity_tracker/`, `workspace/clang_github_tracker/`. +- [Contributing.md](../Contributing.md) – Service layer rule. 
From 7d50b2d90ac47f4d52b9697a6e780031cf49a4be Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 1 Apr 2026 00:25:28 -0400 Subject: [PATCH 61/76] Fix: end_date update --- .../commands/run_clang_github_tracker.py | 65 ++++++++++++------- clang_github_tracker/services.py | 10 ++- clang_github_tracker/state_manager.py | 61 +++++++++-------- clang_github_tracker/sync_raw.py | 27 ++++---- clang_github_tracker/tests/test_commands.py | 14 ++-- .../tests/test_state_manager.py | 8 +-- docs/service_api/clang_github_tracker.md | 2 +- 7 files changed, 109 insertions(+), 78 deletions(-) diff --git a/clang_github_tracker/management/commands/run_clang_github_tracker.py b/clang_github_tracker/management/commands/run_clang_github_tracker.py index 64af311..b3d6552 100644 --- a/clang_github_tracker/management/commands/run_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/run_clang_github_tracker.py @@ -14,7 +14,7 @@ from core.utils.datetime_parsing import parse_iso_datetime from clang_github_tracker import state_manager as clang_state -from clang_github_tracker.sync_raw import sync_raw_only +from clang_github_tracker.sync_raw import sync_clang_github_activity from clang_github_tracker.workspace import OWNER, REPO, get_workspace_root from github_ops import get_github_token, upload_folder_to_github @@ -79,7 +79,7 @@ def add_arguments(self, parser): parser.add_argument( "--skip-github-sync", action="store_true", - help="Skip API fetch / sync_raw_only.", + help="Skip API fetch / sync_clang_github_activity (raw JSON + DB upserts).", ) parser.add_argument( "--skip-markdown-export", @@ -130,8 +130,8 @@ def handle(self, *args, **options): except ValueError as e: raise CommandError(str(e)) from e - start_commit, start_item, end_date = clang_state.resolve_start_end_dates( - since, until + start_commit, start_item, end_date = ( + clang_state.resolve_start_end_dates(since, until) ) logger.info( "Resolved: start_commit=%r start_item=%r end=%r", @@ -140,29 +140,43 @@ def 
handle(self, *args, **options): end_date, ) + # Dry run + if dry_run: if not skip_github_sync: - logger.info("dry-run: would run GitHub sync for llvm/llvm-project") + logger.info( + "dry-run: would run GitHub sync for llvm/llvm-project" + ) else: - logger.info("dry-run: skipping GitHub sync (--skip-github-sync)") + logger.info( + "dry-run: skipping GitHub sync (--skip-github-sync)" + ) if not skip_markdown_export: - logger.info("dry-run: would export Markdown for issues/PRs from sync") + logger.info( + "dry-run: would export Markdown for issues/PRs from sync" + ) if not skip_remote_push: logger.info("dry-run: would push Markdown to private repo") if not skip_pinecone: - logger.info("dry-run: would run Pinecone upsert for issues and PRs") + logger.info( + "dry-run: would run Pinecone upsert for issues and PRs" + ) logger.info("dry-run finished") return issue_numbers: list[int] = [] pr_numbers: list[int] = [] + # GitHub sync + if not skip_github_sync: try: - commits_saved, issue_numbers, pr_numbers = sync_raw_only( - start_commit=start_commit, - start_item=start_item, - end_date=end_date, + commits_saved, issue_numbers, pr_numbers = ( + sync_clang_github_activity( + start_commit=start_commit, + start_item=start_item, + end_date=end_date, + ) ) logger.info( "run_clang_github_tracker: sync done; commits=%s issues=%s prs=%s", @@ -176,6 +190,8 @@ def handle(self, *args, **options): else: logger.info("skipping GitHub sync (--skip-github-sync)") + # Markdown export + md_output_dir = get_workspace_root() / "md_export" md_output_dir.mkdir(parents=True, exist_ok=True) @@ -204,24 +220,21 @@ def handle(self, *args, **options): else: logger.info("skipping Markdown export (--skip-markdown-export)") + # Remote push + if not skip_remote_push: - if not new_files: - if skip_markdown_export and not skip_github_sync: - logger.warning( - "nothing new to push (--skip-markdown-export); skipping remote push" - ) - elif skip_github_sync: - logger.warning("nothing to push from this run (sync 
was skipped)") - elif not issue_numbers and not pr_numbers: - logger.info("no MD files to push (no issues/PRs in sync)") - else: - self._push_markdown(md_output_dir, new_files) + logger.info("push Markdown to configured GitHub repo") + self._push_markdown(md_output_dir, new_files) else: logger.info("skipping remote push (--skip-remote-push)") + # Pinecone sync + if not skip_pinecone: app_type = (settings.CLANG_GITHUB_PINECONE_APP_TYPE or "").strip() - namespace = (settings.CLANG_GITHUB_PINECONE_NAMESPACE or "").strip() + namespace = ( + settings.CLANG_GITHUB_PINECONE_NAMESPACE or "" + ).strip() _run_pinecone_sync( f"{app_type}-issues", namespace, @@ -237,7 +250,9 @@ def handle(self, *args, **options): logger.info("run_clang_github_tracker finished successfully") - def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None: + def _push_markdown( + self, md_output_dir: Path, new_files: dict[str, str] + ) -> None: private_owner = getattr( settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" ).strip() diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index 2d43d88..b8f875a 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -51,7 +51,11 @@ def upsert_commit( sha=sha_clean, defaults={"github_committed_at": github_committed_at}, ) - logger.debug("clang commit %s %s", sha_clean[:8], "created" if created else "updated") + logger.debug( + "clang commit %s %s", + sha_clean[:8], + "created" if created else "updated", + ) return obj, created @@ -68,7 +72,7 @@ def get_commit_watermark() -> Optional[datetime]: def start_after_watermark(max_dt: datetime | None) -> datetime | None: - """Return ``max + 1s`` for API fetch lower bound, or ``None`` if no watermark.""" + """Return ``max + 1ms`` for API fetch lower bound, or ``None`` if no watermark.""" if max_dt is None: return None - return max_dt + timedelta(seconds=1) + return max_dt + timedelta(milliseconds=1) diff --git 
a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index 436cc64..5c6ba1f 100644 --- a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -38,30 +38,35 @@ def _aware_utc(dt: datetime | None) -> datetime | None: return dt.astimezone(timezone.utc) -def _apply_since_floor(cursor_start: datetime | None, since: datetime | None) -> datetime | None: - """Lower bound: max(DB cursor, ``since``) when ``since`` is set; else DB cursor.""" - if since is None: - return cursor_start - s = _aware_utc(since) - if cursor_start is None: - return s - c = _aware_utc(cursor_start) - assert c is not None and s is not None - return max(c, s) - - def resolve_start_end_dates( since: datetime | None, until: datetime | None, -) -> tuple[datetime | None, datetime | None, datetime]: +) -> tuple[datetime | None, datetime | None, datetime | None]: """ - Resolve ``start_commit``, unified ``start_item`` (issues+PRs), and ``end_date``. + Build GitHub sync window: ``(start_commit, start_item, end_date)`` in UTC. + + ``start_item`` is the single lower bound for the unified issues+PRs ``/issues`` fetch; + ``start_commit`` is the lower bound for the commits stream. Missing bounds mean + “from beginning” for starts. Naive datetimes are treated as UTC. + + **Closed window** — both ``since`` and ``until`` are set: - - If both ``since`` and ``until`` are set and ``since <= until``: use ``since`` for - both starts and ``until`` as end. - - If ``since > until``: log warning and ignore both bounds (fall back to DB + now). - - Otherwise: starts from DB max + 1s (or None if empty/null max), with optional - ``since`` as a per-stream floor. End is ``until`` or ``timezone.now()``. + - If ``since <= until``: return ``(since, since, until)`` (same lower bound for both + streams; explicit end). + - If ``since > until``: log a warning, discard both CLI bounds, then use the + **DB watermark** path below. ``end_date`` is ``None``. 
+ + **Otherwise** (no ``since``, or only one side after the rules above): + + - ``end_date`` is ``until`` when ``until`` was provided, else ``None``. A ``None`` + end means “through now” for callers; ``sync_clang_github_activity`` substitutes + ``timezone.now()`` before fetching. + + - **Starts:** If ``since`` is set (without a valid closed window): ``start_commit`` + and ``start_item`` are both ``since``. If ``since`` is not set: both are + ``Max(github_* timestamp) + 1 millisecond`` from the DB when a watermark exists, else + ``None`` (full history). Watermarks use ``Max(github_committed_at)`` and + ``Max(github_updated_at)`` on ``ClangGithubCommit`` / ``ClangGithubIssueItem``. """ since_aware = _aware_utc(since) until_aware = _aware_utc(until) @@ -69,7 +74,8 @@ def resolve_start_end_dates( if since_aware is not None and until_aware is not None: if since_aware > until_aware: logger.warning( - "invalid date range: since (%s) is after until (%s); using DB cursors and default end", + "invalid date range: since (%s) is after until (%s); " + "using DB cursors; end_date None (sync applies now if needed)", since_aware, until_aware, ) @@ -77,12 +83,13 @@ def resolve_start_end_dates( else: return since_aware, since_aware, until_aware - end_date = until_aware if until_aware is not None else timezone.now() - - item_wm = start_after_watermark(get_issue_item_watermark()) - commit_wm = start_after_watermark(get_commit_watermark()) + end_date = until_aware - start_item = _apply_since_floor(item_wm, since_aware) - start_commit = _apply_since_floor(commit_wm, since_aware) + if since_aware is None: + item_wm = start_after_watermark(get_issue_item_watermark()) + commit_wm = start_after_watermark(get_commit_watermark()) + else: + item_wm = since_aware + commit_wm = since_aware - return start_commit, start_item, end_date + return commit_wm, item_wm, end_date diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 90157b7..9492373 100644 ---
a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -50,7 +50,7 @@ def _commit_date(commit_data: dict) -> datetime | None: return parse_datetime(date_str) or clang_state.parse_iso(date_str) -def sync_raw_only( +def sync_clang_github_activity( start_commit: datetime | None = None, start_item: datetime | None = None, end_date: Optional[datetime] = None, @@ -62,17 +62,14 @@ def sync_raw_only( Args: start_commit: Start date for commits (None = from beginning). start_item: Single lower bound for unified issues+PRs ``/issues`` fetch. - end_date: End date for all (default: now). + end_date: End date for all (default: None = sync through now). Returns: (commits_saved, issue_numbers, pr_numbers). """ - from django.utils import timezone as django_tz owner = OWNER repo = REPO - if end_date is None: - end_date = django_tz.now() end_date = _ensure_utc(end_date) start_commit = _ensure_utc(start_commit) start_item = _ensure_utc(start_item) @@ -114,13 +111,17 @@ def sync_raw_only( clang_services.upsert_issue_item( num, is_pull_request=True, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + github_created_at=parse_datetime( + flat.get("created_at") + ), + github_updated_at=parse_datetime( + flat.get("updated_at") + ), ) else: - issue_number = (item.get("issue_info") or {}).get("number") or item.get( + issue_number = (item.get("issue_info") or {}).get( "number" - ) + ) or item.get("number") if issue_number is not None: save_issue_raw_source(owner, repo, item) issue_numbers.append(issue_number) @@ -130,8 +131,12 @@ def sync_raw_only( clang_services.upsert_issue_item( num, is_pull_request=False, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + github_created_at=parse_datetime( + flat.get("created_at") + ), + github_updated_at=parse_datetime( + flat.get("updated_at") + ), ) except (ConnectionException, 
RateLimitException) as e: diff --git a/clang_github_tracker/tests/test_commands.py b/clang_github_tracker/tests/test_commands.py index 95a0ff6..3150495 100644 --- a/clang_github_tracker/tests/test_commands.py +++ b/clang_github_tracker/tests/test_commands.py @@ -15,7 +15,7 @@ def test_run_clang_github_tracker_dry_run_logs_resolved(caplog): """Dry run resolves dates from DB and does not call sync.""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only" + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" ) as sync_mock: with caplog.at_level(logging.INFO): call_command(CMD_NAME, "--dry-run", stdout=StringIO(), stderr=StringIO()) @@ -42,7 +42,7 @@ def test_run_clang_github_tracker_dry_run_skip_sync(caplog): def test_run_clang_github_tracker_since_until_aliases(caplog): """--from-date/--to-date aliases parse like Boost.""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only" + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" ) as sync_mock: with caplog.at_level(logging.INFO): call_command( @@ -58,10 +58,12 @@ def test_run_clang_github_tracker_since_until_aliases(caplog): @pytest.mark.django_db -def test_run_clang_github_tracker_calls_sync_raw_only_when_not_dry_run(caplog): - """Without --dry-run, command calls sync_raw_only with start_item.""" +def test_run_clang_github_tracker_calls_sync_clang_github_activity_when_not_dry_run( + caplog, +): + """Without --dry-run, command calls sync_clang_github_activity with start_item.""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only", + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", return_value=(0, [], []), ) as sync_mock: with caplog.at_level(logging.INFO): @@ -85,7 +87,7 @@ def test_run_clang_github_tracker_calls_sync_raw_only_when_not_dry_run(caplog): def 
test_run_clang_github_tracker_skip_pinecone(caplog): """--skip-pinecone does not call run_cppa_pinecone_sync.""" with patch( - "clang_github_tracker.management.commands.run_clang_github_tracker.sync_raw_only", + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", return_value=(0, [1], []), ): with patch( diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index fdf9428..d4fc848 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -29,14 +29,12 @@ def test_parse_iso_invalid_or_empty(): @pytest.mark.django_db def test_resolve_empty_db_no_since_until(): - """Empty tables → None starts; end is now (approximately).""" + """Empty tables → None starts; end None until caller passes --until.""" ClangGithubIssueItem.objects.all().delete() ClangGithubCommit.objects.all().delete() sc, si, end = clang_state.resolve_start_end_dates(None, None) assert sc is None and si is None - assert end is not None - now = timezone.now() - assert abs((end - now).total_seconds()) < 5 + assert end is None @pytest.mark.django_db @@ -82,7 +80,7 @@ def test_resolve_invalid_range_clears_bounds(caplog): with caplog.at_level("WARNING"): sc, si, end = clang_state.resolve_start_end_dates(since, until) assert any("invalid date range" in r.getMessage() for r in caplog.records) - assert end is not None + assert end is None assert sc is not None and si is not None diff --git a/docs/service_api/clang_github_tracker.md b/docs/service_api/clang_github_tracker.md index 8d6a7ee..8655cbc 100644 --- a/docs/service_api/clang_github_tracker.md +++ b/docs/service_api/clang_github_tracker.md @@ -1,7 +1,7 @@ # clang_github_tracker.services **Module path:** `clang_github_tracker.services` -**Description:** Upserts for `ClangGithubIssueItem` and `ClangGithubCommit` (no FKs to other apps). 
Used by `sync_raw_only`, `backfill_clang_github_tracker`, and date resolution watermarks. +**Description:** Upserts for `ClangGithubIssueItem` and `ClangGithubCommit` (no FKs to other apps). Used by `sync_clang_github_activity`, `backfill_clang_github_tracker`, and date resolution watermarks. **Type notation:** Models live in `clang_github_tracker.models`. From d5c9534d584796e8b82d036f7326c8f08194110c Mon Sep 17 00:00:00 2001 From: snowfox1003 Date: Wed, 1 Apr 2026 23:18:36 -0400 Subject: [PATCH 62/76] Add Clang markdown publishing with context-repo settings, chunked raw backfill upserts, batch DB helpers, stale-export detection in github_export, safer git clone errors without leaking tokens, and aligned tests including isolated test_settings for publish - #136 --- .env.example | 20 +- .../commands/backfill_clang_github_tracker.py | 204 +++++++++++------- .../commands/run_clang_github_tracker.py | 116 ++++------ clang_github_tracker/publisher.py | 191 ++++++++++++++++ clang_github_tracker/services.py | 111 ++++++++++ clang_github_tracker/sync_raw.py | 20 +- clang_github_tracker/tests/test_backfill.py | 3 +- clang_github_tracker/tests/test_commands.py | 67 +++++- .../tests/test_preprocessors.py | 9 +- clang_github_tracker/tests/test_publisher.py | 150 +++++++++++++ clang_github_tracker/tests/test_services.py | 29 ++- .../tests/test_state_manager.py | 16 +- clang_github_tracker/workspace.py | 2 +- config/settings.py | 22 +- config/test_settings.py | 4 + docs/service_api/clang_github_tracker.md | 2 +- github_ops/git_ops.py | 14 +- operations/md_ops/github_export.py | 146 ++++++++----- operations/tests/test_github_export.py | 111 ++++++++++ 19 files changed, 978 insertions(+), 259 deletions(-) create mode 100644 clang_github_tracker/publisher.py create mode 100644 clang_github_tracker/tests/test_publisher.py create mode 100644 operations/tests/test_github_export.py diff --git a/.env.example b/.env.example index fca6343..468461a 100644 --- a/.env.example +++ 
b/.env.example @@ -76,19 +76,6 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # Slack webhook URL (get from Slack: https://api.slack.com/messaging/webhooks) # SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL -# ============================================================================= -# Clang GitHub Tracker -# ============================================================================= -# GitHub repo to sync (default: llvm/llvm-project). -# CLANG_GITHUB_OWNER=llvm -# CLANG_GITHUB_REPO=llvm-project -# -# Private repo for Markdown export (optional). -# Issues/PRs are exported to: issues/YYYY/YYYY-MM/#N - title.md -# If unset, upload is skipped and an error is logged. -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER=your-org -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME=your-private-repo -# CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH=main # ============================================================================= # GitHub tokens (multiple use cases) @@ -108,6 +95,13 @@ DATABASE_URL=postgres://user:password@localhost:5432/boost_dashboard # GitHub repo to sync (default: llvm/llvm-project). # CLANG_GITHUB_OWNER=llvm # CLANG_GITHUB_REPO=llvm-project +# Markdown publish target (optional; see also Clang section above). +# CLANG_GITHUB_CONTEXT_REPO_OWNER=your-org +# CLANG_GITHUB_CONTEXT_REPO_NAME=your-repo +# CLANG_GITHUB_CONTEXT_REPO_BRANCH=main +# If that repo is private: set GITHUB_TOKEN_WRITE to a PAT that can read+push it +# (classic: repo scope; fine-grained: grant this repository). Publish uses the +# write token, not GITHUB_TOKENS_SCRAPING. # Pinecone sync (run_cppa_pinecone_sync) — app_type and namespace when triggering from this app. 
# CLANG_GITHUB_PINECONE_APP_TYPE=github-clang # CLANG_GITHUB_PINECONE_NAMESPACE=github-clang diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py index e5380e6..9657754 100644 --- a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -8,6 +8,7 @@ import json import logging import re +from datetime import datetime from pathlib import Path from django.core.management.base import BaseCommand, CommandError @@ -29,6 +30,7 @@ logger = logging.getLogger(__name__) _SHA40 = re.compile(r"^[0-9a-fA-F]{40}$") +_RAW_CHUNK_EVERY = 10_000 def _commit_date_from_json(data: dict): @@ -77,7 +79,9 @@ def handle(self, *args, **options): def _backfill_from_csv(self, path: Path) -> None: if not path.is_file(): raise CommandError(f"CSV not found: {path}") - inserted = updated = skipped = 0 + commit_rows: list[tuple[str, datetime | None]] = [] + issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] + skipped = 0 with path.open(encoding="utf-8", newline="") as f: reader = csv.DictReader(f) if not reader.fieldnames: @@ -87,28 +91,22 @@ def _backfill_from_csv(self, path: Path) -> None: try: if rt == "issue": num = int((row.get("number") or "").strip()) - gc = parse_datetime((row.get("github_created_at") or "").strip()) - gu = parse_datetime((row.get("github_updated_at") or "").strip()) - _, was_created = clang_services.upsert_issue_item( - num, - is_pull_request=False, - github_created_at=gc, - github_updated_at=gu, + gc = parse_datetime( + (row.get("github_created_at") or "").strip() + ) + gu = parse_datetime( + (row.get("github_updated_at") or "").strip() ) - inserted += bool(was_created) - updated += not was_created + issue_rows.append((num, False, gc, gu)) elif rt == "pr": num = int((row.get("number") or "").strip()) - gc = 
parse_datetime((row.get("github_created_at") or "").strip()) - gu = parse_datetime((row.get("github_updated_at") or "").strip()) - _, was_created = clang_services.upsert_issue_item( - num, - is_pull_request=True, - github_created_at=gc, - github_updated_at=gu, + gc = parse_datetime( + (row.get("github_created_at") or "").strip() + ) + gu = parse_datetime( + (row.get("github_updated_at") or "").strip() ) - inserted += bool(was_created) - updated += not was_created + issue_rows.append((num, True, gc, gu)) elif rt == "commit": sha = (row.get("sha") or "").strip() if not _SHA40.match(sha): @@ -118,21 +116,23 @@ def _backfill_from_csv(self, path: Path) -> None: gcm = parse_datetime( (row.get("github_committed_at") or "").strip() ) - _, was_created = clang_services.upsert_commit( - sha, github_committed_at=gcm - ) - inserted += bool(was_created) - updated += not was_created + commit_rows.append((sha, gcm)) else: logger.warning("skip row: unknown record_type %r", rt) skipped += 1 except (TypeError, ValueError) as e: logger.warning("skip row: %s (row=%r)", e, row) skipped += 1 + + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) logger.info( - "CSV backfill done: inserted=%s updated=%s skipped=%s path=%s", - inserted, - updated, + "CSV backfill done: issues_prs inserted=%s updated=%s commits inserted=%s " + "updated=%s skipped=%s path=%s", + ins_i, + upd_i, + ins_c, + upd_c, skipped, path, ) @@ -144,36 +144,53 @@ def _backfill_from_raw(self) -> None: commits_dir = root / "commits" if commits_dir.is_dir(): - c_ins = c_upd = c_skip = 0 - for p in commits_dir.glob("*.json"): + commit_rows: list[tuple[str, datetime | None]] = [] + c_skip = 0 + c_ins_total = c_upd_total = 0 + for c_read_n, p in enumerate(sorted(commits_dir.glob("*.json")), start=1): try: data = json.loads(p.read_text(encoding="utf-8")) sha = (data.get("sha") or "").strip() if not _SHA40.match(sha): c_skip += 1 continue - dt = 
_commit_date_from_json(data) - _, was_created = clang_services.upsert_commit( - sha, github_committed_at=dt - ) - if was_created: - c_ins += 1 - else: - c_upd += 1 + commit_rows.append((sha, _commit_date_from_json(data))) except Exception as e: # pylint: disable=broad-exception-caught logger.warning("skip commit file %s: %s", p, e) c_skip += 1 + if c_read_n % _RAW_CHUNK_EVERY == 0: + if commit_rows: + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) + c_ins_total += ins_c + c_upd_total += upd_c + commit_rows.clear() + logger.info( + "raw commits/: read %s JSON files; cumulative " + "inserted=%s updated=%s skipped=%s", + c_read_n, + c_ins_total, + c_upd_total, + c_skip, + ) + if commit_rows: + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) + c_ins_total += ins_c + c_upd_total += upd_c logger.info( - "raw commits/: inserted=%s updated=%s skipped=%s", - c_ins, - c_upd, + "raw commits/: done inserted=%s updated=%s skipped=%s", + c_ins_total, + c_upd_total, c_skip, ) + issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] + i_ins_total = i_upd_total = 0 + issues_dir = root / "issues" if issues_dir.is_dir(): - i_ins = i_upd = i_skip = 0 - for p in issues_dir.glob("*.json"): + i_skip = 0 + i_ok = 0 + for i_read_n, p in enumerate(sorted(issues_dir.glob("*.json")), start=1): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_issue_json(data) @@ -181,55 +198,90 @@ def _backfill_from_raw(self) -> None: if not isinstance(num, int) or num <= 0: i_skip += 1 continue - _, was_created = clang_services.upsert_issue_item( - num, - is_pull_request=False, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + issue_rows.append( + ( + num, + False, + parse_datetime(flat.get("created_at")), + parse_datetime(flat.get("updated_at")), + ) ) - if was_created: - i_ins += 1 - else: - i_upd += 1 + i_ok += 1 except Exception as e: # pylint: 
disable=broad-exception-caught logger.warning("skip issue file %s: %s", p, e) i_skip += 1 - logger.info( - "raw issues/: inserted=%s updated=%s skipped=%s", - i_ins, - i_upd, - i_skip, - ) + if i_read_n % _RAW_CHUNK_EVERY == 0: + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info( + "raw issues/: read %s JSON files; cumulative " + "issues+prs inserted=%s updated=%s", + i_read_n, + i_ins_total, + i_upd_total, + ) + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info("raw issues/: parsed_ok=%s skipped=%s", i_ok, i_skip) prs_dir = root / "prs" if prs_dir.is_dir(): - p_ins = p_upd = p_skip = 0 - for p in prs_dir.glob("*.json"): + pr_skip = 0 + pr_ok = 0 + for pr_read_n, p in enumerate(sorted(prs_dir.glob("*.json")), start=1): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_pr_json(data) num = flat.get("number") if not isinstance(num, int) or num <= 0: - p_skip += 1 + pr_skip += 1 continue - _, was_created = clang_services.upsert_issue_item( - num, - is_pull_request=True, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + issue_rows.append( + ( + num, + True, + parse_datetime(flat.get("created_at")), + parse_datetime(flat.get("updated_at")), + ) ) - if was_created: - p_ins += 1 - else: - p_upd += 1 + pr_ok += 1 except Exception as e: # pylint: disable=broad-exception-caught logger.warning("skip pr file %s: %s", p, e) - p_skip += 1 - logger.info( - "raw prs/: inserted=%s updated=%s skipped=%s", - p_ins, - p_upd, - p_skip, - ) + pr_skip += 1 + if pr_read_n % _RAW_CHUNK_EVERY == 0: + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info( 
+ "raw prs/: read %s JSON files; cumulative " + "issues+prs inserted=%s updated=%s", + pr_read_n, + i_ins_total, + i_upd_total, + ) + if issue_rows: + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + i_ins_total += ins_i + i_upd_total += upd_i + issue_rows.clear() + logger.info("raw prs/: parsed_ok=%s skipped=%s", pr_ok, pr_skip) + + logger.info( + "raw issues+prs DB total: inserted=%s updated=%s", + i_ins_total, + i_upd_total, + ) logger.info("raw backfill finished root=%s", root) diff --git a/clang_github_tracker/management/commands/run_clang_github_tracker.py b/clang_github_tracker/management/commands/run_clang_github_tracker.py index b3d6552..e079ce1 100644 --- a/clang_github_tracker/management/commands/run_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/run_clang_github_tracker.py @@ -2,7 +2,7 @@ Management command: run_clang_github_tracker Fetches GitHub activity for llvm/llvm-project, saves raw JSON and DB rows, optionally -exports Markdown and pushes to the private repo. Resume uses DB watermarks (not state.json). +exports Markdown and pushes to the configured Clang markdown GitHub repo. Resume uses DB watermarks (not state.json). 
""" import logging @@ -15,17 +15,14 @@ from core.utils.datetime_parsing import parse_iso_datetime from clang_github_tracker import state_manager as clang_state from clang_github_tracker.sync_raw import sync_clang_github_activity +from clang_github_tracker.publisher import publish_clang_markdown from clang_github_tracker.workspace import OWNER, REPO, get_workspace_root -from github_ops import get_github_token, upload_folder_to_github -from operations.md_ops.github_export import ( - detect_renames_from_dirs, - write_md_files, -) +from operations.md_ops.github_export import write_md_files logger = logging.getLogger(__name__) -DEFAULT_PRIVATE_MD_BRANCH = "master" +DEFAULT_CLANG_REPO_BRANCH = "master" def _run_pinecone_sync( @@ -89,7 +86,7 @@ def add_arguments(self, parser): parser.add_argument( "--skip-remote-push", action="store_true", - help="Skip push to CLANG_GITHUB_TRACKER_PRIVATE_REPO_*.", + help="Skip push to the repo configured via CLANG_GITHUB_CONTEXT_REPO_OWNER / CLANG_GITHUB_CONTEXT_REPO_NAME.", ) parser.add_argument( "--skip-pinecone", @@ -130,8 +127,8 @@ def handle(self, *args, **options): except ValueError as e: raise CommandError(str(e)) from e - start_commit, start_item, end_date = ( - clang_state.resolve_start_end_dates(since, until) + start_commit, start_item, end_date = clang_state.resolve_start_end_dates( + since, until ) logger.info( "Resolved: start_commit=%r start_item=%r end=%r", @@ -144,23 +141,15 @@ def handle(self, *args, **options): if dry_run: if not skip_github_sync: - logger.info( - "dry-run: would run GitHub sync for llvm/llvm-project" - ) + logger.info("dry-run: would run GitHub sync for llvm/llvm-project") else: - logger.info( - "dry-run: skipping GitHub sync (--skip-github-sync)" - ) + logger.info("dry-run: skipping GitHub sync (--skip-github-sync)") if not skip_markdown_export: - logger.info( - "dry-run: would export Markdown for issues/PRs from sync" - ) + logger.info("dry-run: would export Markdown for issues/PRs from sync") if not 
skip_remote_push: - logger.info("dry-run: would push Markdown to private repo") + logger.info("dry-run: would push Markdown to configured Clang repo") if not skip_pinecone: - logger.info( - "dry-run: would run Pinecone upsert for issues and PRs" - ) + logger.info("dry-run: would run Pinecone upsert for issues and PRs") logger.info("dry-run finished") return @@ -171,12 +160,10 @@ def handle(self, *args, **options): if not skip_github_sync: try: - commits_saved, issue_numbers, pr_numbers = ( - sync_clang_github_activity( - start_commit=start_commit, - start_item=start_item, - end_date=end_date, - ) + commits_saved, issue_numbers, pr_numbers = sync_clang_github_activity( + start_commit=start_commit, + start_item=start_item, + end_date=end_date, ) logger.info( "run_clang_github_tracker: sync done; commits=%s issues=%s prs=%s", @@ -232,9 +219,7 @@ def handle(self, *args, **options): if not skip_pinecone: app_type = (settings.CLANG_GITHUB_PINECONE_APP_TYPE or "").strip() - namespace = ( - settings.CLANG_GITHUB_PINECONE_NAMESPACE or "" - ).strip() + namespace = (settings.CLANG_GITHUB_PINECONE_NAMESPACE or "").strip() _run_pinecone_sync( f"{app_type}-issues", namespace, @@ -250,62 +235,35 @@ def handle(self, *args, **options): logger.info("run_clang_github_tracker finished successfully") - def _push_markdown( - self, md_output_dir: Path, new_files: dict[str, str] - ) -> None: - private_owner = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", "" + def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None: + clang_github_context_repo_owner = getattr( + settings, "CLANG_GITHUB_CONTEXT_REPO_OWNER", "" ).strip() - private_repo_name = getattr( - settings, "CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", "" + clang_github_context_repo_name = getattr( + settings, "CLANG_GITHUB_CONTEXT_REPO_NAME", "" ).strip() - private_branch = ( + clang_github_context_repo_branch = ( getattr( settings, - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", - 
DEFAULT_PRIVATE_MD_BRANCH, + "CLANG_GITHUB_CONTEXT_REPO_BRANCH", + DEFAULT_CLANG_REPO_BRANCH, ) - or DEFAULT_PRIVATE_MD_BRANCH + or DEFAULT_CLANG_REPO_BRANCH ).strip() - if not private_owner or not private_repo_name: + if not clang_github_context_repo_owner or not clang_github_context_repo_name: logger.error( - "CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER / CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME " - "not configured; skipping upload." + "CLANG_GITHUB_CONTEXT_REPO_OWNER / CLANG_GITHUB_CONTEXT_REPO_NAME " + "not configured; skipping Markdown push." ) return - token = get_github_token(use="write") - delete_paths = detect_renames_from_dirs( - private_owner, - private_repo_name, - private_branch, + publish_clang_markdown( + md_output_dir, + clang_github_context_repo_owner, + clang_github_context_repo_name, + clang_github_context_repo_branch, new_files, - token=token, - ) - for repo_rel in delete_paths: - stale_local = md_output_dir / repo_rel - if stale_local.exists(): - stale_local.unlink() - if delete_paths: - logger.info( - "run_clang_github_tracker: %s renamed file(s) to delete.", - len(delete_paths), - ) - - result = upload_folder_to_github( - local_folder=md_output_dir, - owner=private_owner, - repo=private_repo_name, - commit_message="chore: update Clang issues/PRs markdown", - branch=private_branch, - delete_paths=delete_paths or None, ) - - if result.get("success"): - logger.info("run_clang_github_tracker: MD upload complete.") - for local_path in new_files.values(): - Path(local_path).unlink(missing_ok=True) - else: - msg = result.get("message") or "Upload failed" - logger.error("run_clang_github_tracker: MD upload failed: %s", msg) - raise CommandError(msg) + logger.info("run_clang_github_tracker: MD publish complete.") + for local_path in new_files.values(): + Path(local_path).unlink(missing_ok=True) diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py new file mode 100644 index 0000000..6472c05 --- /dev/null +++ 
b/clang_github_tracker/publisher.py @@ -0,0 +1,191 @@ +"""Publish Clang markdown export to GitHub via a persistent clone.""" + +from __future__ import annotations + +import logging +import re +import shutil +import subprocess +from pathlib import Path + +from django.conf import settings +from django.core.management.base import CommandError + +from github_ops.git_ops import clone_repo, prepare_repo_for_pull, pull, push as git_push +from github_ops.tokens import get_github_token +from operations.md_ops.github_export import detect_stale_titled_paths + +logger = logging.getLogger(__name__) + +_GITHUB_OWNER_REPO_SLUG = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?$") + + +def _validate_github_slug(label: str, value: str) -> str: + """Return stripped owner or repo name, or raise CommandError if unsafe or invalid.""" + v = (value or "").strip() + if not v: + raise CommandError(f"Invalid GitHub {label}: empty") + if v in (".", ".."): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if "/" in v or "\\" in v: + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if Path(v).is_absolute(): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + if not _GITHUB_OWNER_REPO_SLUG.fullmatch(v): + raise CommandError(f"Invalid GitHub {label}: {v!r}") + return v + + +def _reset_hard_to_upstream(clone_dir: Path, remote: str, branch: str) -> None: + """Match origin/ after pull so unpushed local commits from a failed push are dropped.""" + ref = f"{remote}/{branch}" + try: + subprocess.run( + ["git", "-C", str(clone_dir), "reset", "--hard", ref], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + except subprocess.CalledProcessError as e: + err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + raise CommandError( + f"Could not reset clone to {ref}: {err}" + ) from e + + +def _copy_md_tree(md_output_dir: Path, clone_dir: Path) -> None: + """Copy all files under md_output_dir into clone_dir (preserve relative 
paths).""" + md_output_dir = md_output_dir.resolve() + clone_dir = clone_dir.resolve() + for path in md_output_dir.rglob("*"): + if not path.is_file(): + continue + if ".git" in path.relative_to(md_output_dir).parts: + continue + rel = path.relative_to(md_output_dir) + dest = clone_dir / rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(path, dest) + + +def publish_clang_markdown( + md_output_dir: Path, + owner: str, + repo: str, + branch: str, + new_files: dict[str, str], +) -> None: + """ + Clone (if needed) at RAW_DIR/clang_github_tracker//, fetch/clean/pull, + align to origin/, remove stale titled .md in md_export and clone, overlay + md_export into the clone, commit and push. + + Uses get_github_token(use=\"write\") and settings GIT_AUTHOR_* for the commit. + """ + owner = _validate_github_slug("owner", owner) + repo = _validate_github_slug("repo", repo) + + publish_root = (Path(settings.RAW_DIR) / "clang_github_tracker").resolve() + clone_dir = (publish_root / owner / repo).resolve() + try: + clone_dir.relative_to(publish_root) + except ValueError as e: + raise CommandError( + f"Publish clone path escapes clang publish root: {clone_dir}" + ) from e + + md_output_dir = md_output_dir.resolve() + if ( + clone_dir == md_output_dir + or clone_dir in md_output_dir.parents + or md_output_dir in clone_dir.parents + ): + raise CommandError( + "Markdown output directory must not overlap with the publish clone path: " + f"{clone_dir}" + ) + + # Private CLANG_GITHUB_CONTEXT_* repos need a PAT that can read them (clone/pull) + # and push; get_github_token("write") uses GITHUB_TOKEN_WRITE or GITHUB_TOKEN. 
+ token = get_github_token(use="write") + git_user_name = (getattr(settings, "GIT_AUTHOR_NAME", None) or "unknown").strip() + git_user_email = ( + getattr(settings, "GIT_AUTHOR_EMAIL", None) or "unknown@noreply.github.com" + ).strip() + + repo_slug = f"{owner}/{repo}" + logger.info("Publishing Clang markdown to %s (%s)...", repo_slug, branch) + logger.info( + "Publish git operations use the write token (GITHUB_TOKEN_WRITE, else " + "GITHUB_TOKEN). For a private target repo, that PAT must be granted access " + "to %s.", + repo_slug, + ) + + clone_dir.parent.mkdir(parents=True, exist_ok=True) + if not clone_dir.exists() or not (clone_dir / ".git").is_dir(): + if clone_dir.exists(): + shutil.rmtree(clone_dir) + logger.info("Cloning %s to %s", repo_slug, clone_dir) + try: + clone_repo(repo_slug, clone_dir, token=token) + except subprocess.CalledProcessError as e: + tail = ((e.stderr or "") + (e.stdout or "")).strip() + hint = ( + "Clone already uses get_github_token(use='write') (GITHUB_TOKEN_WRITE " + "or GITHUB_TOKEN). Verify CLANG_GITHUB_CONTEXT_REPO_OWNER / _NAME, " + "and that this PAT can access the repo: for a private repo use a " + "classic PAT with 'repo' scope or a fine-grained PAT with access to " + "that repository. GitHub often returns 'not found' when the token " + "lacks access." + ) + logger.error("clang_github_tracker publish: git clone failed: %s", tail or e) + raise CommandError( + f"Git clone failed for {repo_slug}: {tail or e.returncode}. 
{hint}" + ) from e + + logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) + prepare_repo_for_pull(clone_dir, remote="origin", token=token) + + logger.info("Pulling latest for %s", clone_dir) + pull(clone_dir, branch=branch, token=token) + + logger.info("Resetting clone to origin/%s (discard unpushed commits)", branch) + _reset_hard_to_upstream(clone_dir, "origin", branch) + + stale_md = detect_stale_titled_paths(md_output_dir, new_files) + stale_clone = detect_stale_titled_paths(clone_dir, new_files) + all_stale = sorted(set(stale_md) | set(stale_clone)) + + for rel in all_stale: + for base in (md_output_dir, clone_dir): + p = base / rel + if p.is_file(): + p.unlink() + + if all_stale: + logger.info( + "clang_github_tracker publish: removed %s stale titled file(s).", + len(all_stale), + ) + + _copy_md_tree(md_output_dir, clone_dir) + + try: + git_push( + clone_dir, + remote="origin", + branch=branch, + commit_message="chore: update Clang issues/PRs markdown", + token=token, + git_user_name=git_user_name, + git_user_email=git_user_email, + ) + except subprocess.CalledProcessError as e: + err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + logger.error("clang_github_tracker publish: git push failed: %s", err) + raise CommandError(f"Git push failed: {err}") from e + + logger.info("Clang markdown published successfully to %s.", repo_slug) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index b8f875a..b8ba84f 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from collections.abc import Sequence from datetime import datetime, timedelta from typing import Optional @@ -12,6 +13,8 @@ logger = logging.getLogger(__name__) +DEFAULT_UPSERT_BATCH_SIZE = 500 + def upsert_issue_item( number: int, @@ -59,6 +62,114 @@ def upsert_commit( return obj, created +def _flush_commits_chunk( + pairs: 
list[tuple[str, datetime | None]], +) -> tuple[int, int]: + """Write one chunk; returns (inserted_count, updated_count).""" + if not pairs: + return 0, 0 + shas = [s for s, _ in pairs] + existing = set( + ClangGithubCommit.objects.filter(sha__in=shas).values_list("sha", flat=True) + ) + objs = [ClangGithubCommit(sha=s, github_committed_at=dt) for s, dt in pairs] + ClangGithubCommit.objects.bulk_create( + objs, + batch_size=len(objs), + update_conflicts=True, + unique_fields=["sha"], + update_fields=["github_committed_at"], + ) + inserted = sum(1 for s, _ in pairs if s not in existing) + updated = len(pairs) - inserted + return inserted, updated + + +def upsert_commits_batch( + rows: Sequence[tuple[str, datetime | None]], + *, + batch_size: int = DEFAULT_UPSERT_BATCH_SIZE, +) -> tuple[int, int]: + """Batch upsert commits by ``sha``. Skips rows whose sha is not 40 chars. + + Returns: + (inserted, updated) counts across all batches. + """ + merged: dict[str, datetime | None] = {} + for sha, dt in rows: + s = (sha or "").strip() + if len(s) != 40: + continue + merged[s] = dt + inserted = updated = 0 + items = list(merged.items()) + for i in range(0, len(items), batch_size): + di, du = _flush_commits_chunk(items[i : i + batch_size]) + inserted += di + updated += du + return inserted, updated + + +def _flush_issue_items_chunk( + rows: list[tuple[int, bool, datetime | None, datetime | None]], +) -> tuple[int, int]: + if not rows: + return 0, 0 + nums = [n for n, _, _, _ in rows] + existing = set( + ClangGithubIssueItem.objects.filter(number__in=nums).values_list( + "number", flat=True + ) + ) + objs = [ + ClangGithubIssueItem( + number=n, + is_pull_request=is_pr, + github_created_at=gc, + github_updated_at=gu, + ) + for n, is_pr, gc, gu in rows + ] + ClangGithubIssueItem.objects.bulk_create( + objs, + batch_size=len(objs), + update_conflicts=True, + unique_fields=["number"], + update_fields=[ + "is_pull_request", + "github_created_at", + "github_updated_at", + ], + ) + 
inserted = sum(1 for n, _, _, _ in rows if n not in existing) + updated = len(rows) - inserted + return inserted, updated + + +def upsert_issue_items_batch( + rows: Sequence[tuple[int, bool, datetime | None, datetime | None]], + *, + batch_size: int = DEFAULT_UPSERT_BATCH_SIZE, +) -> tuple[int, int]: + """Batch upsert issue/PR rows by ``number``. Later rows win on duplicate numbers. + + Returns: + (inserted, updated) counts across all batches. + """ + merged: dict[int, tuple[bool, datetime | None, datetime | None]] = {} + for num, is_pr, gc, gu in rows: + if not isinstance(num, int) or num <= 0: + continue + merged[num] = (is_pr, gc, gu) + inserted = updated = 0 + items = [(n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items())] + for i in range(0, len(items), batch_size): + di, du = _flush_issue_items_chunk(items[i : i + batch_size]) + inserted += di + updated += du + return inserted, updated + + def get_issue_item_watermark() -> Optional[datetime]: """Max ``github_updated_at`` across issues and PRs (API fetch cursor base).""" m = ClangGithubIssueItem.objects.aggregate(m=Max("github_updated_at"))["m"] diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 9492373..5f2dee0 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -111,17 +111,13 @@ def sync_clang_github_activity( clang_services.upsert_issue_item( num, is_pull_request=True, - github_created_at=parse_datetime( - flat.get("created_at") - ), - github_updated_at=parse_datetime( - flat.get("updated_at") - ), + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), ) else: - issue_number = (item.get("issue_info") or {}).get( + issue_number = (item.get("issue_info") or {}).get("number") or item.get( "number" - ) or item.get("number") + ) if issue_number is not None: save_issue_raw_source(owner, repo, item) issue_numbers.append(issue_number) @@ -131,12 +127,8 @@ def 
sync_clang_github_activity( clang_services.upsert_issue_item( num, is_pull_request=False, - github_created_at=parse_datetime( - flat.get("created_at") - ), - github_updated_at=parse_datetime( - flat.get("updated_at") - ), + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), ) except (ConnectionException, RateLimitException) as e: diff --git a/clang_github_tracker/tests/test_backfill.py b/clang_github_tracker/tests/test_backfill.py index 74f25a4..61138de 100644 --- a/clang_github_tracker/tests/test_backfill.py +++ b/clang_github_tracker/tests/test_backfill.py @@ -1,7 +1,6 @@ """Tests for backfill_clang_github_tracker.""" import json -from pathlib import Path import pytest from django.core.management import call_command @@ -70,7 +69,7 @@ def test_backfill_from_raw(tmp_path, monkeypatch): ) monkeypatch.setattr( - "clang_github_tracker.workspace.get_raw_repo_dir", + "clang_github_tracker.management.commands.backfill_clang_github_tracker.get_raw_repo_dir", lambda *a, **k: root, ) call_command("backfill_clang_github_tracker", "--from-raw") diff --git a/clang_github_tracker/tests/test_commands.py b/clang_github_tracker/tests/test_commands.py index 3150495..990b100 100644 --- a/clang_github_tracker/tests/test_commands.py +++ b/clang_github_tracker/tests/test_commands.py @@ -1,12 +1,13 @@ """Tests for clang_github_tracker management commands.""" import logging - -import pytest from io import StringIO from unittest.mock import patch +import pytest from django.core.management import call_command +from django.core.management.base import CommandError +from django.test import override_settings CMD_NAME = "run_clang_github_tracker" @@ -107,8 +108,64 @@ def test_run_clang_github_tracker_skip_pinecone(caplog): stderr=StringIO(), ) pinecone_calls = [ - c - for c in cc.call_args_list - if c[0] and c[0][0] == "run_cppa_pinecone_sync" + c for c in cc.call_args_list if c[0] and c[0][0] == "run_cppa_pinecone_sync" ] 
assert not pinecone_calls + + +@pytest.mark.django_db +@override_settings( + CLANG_GITHUB_CONTEXT_REPO_OWNER="myorg", + CLANG_GITHUB_CONTEXT_REPO_NAME="myrepo", + CLANG_GITHUB_CONTEXT_REPO_BRANCH="main", +) +def test_push_markdown_calls_publish_and_unlinks_new_files(tmp_path): + """_push_markdown invokes publish_clang_markdown then removes per-run md files.""" + md = tmp_path / "md_export" + md.mkdir() + f = md / "issues" / "2024" / "2024-01" + f.mkdir(parents=True) + one = f / "#1 - A.md" + one.write_text("x", encoding="utf-8") + new_files = {"issues/2024/2024-01/#1 - A.md": str(one)} + + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.publish_clang_markdown" + ) as pub: + from clang_github_tracker.management.commands.run_clang_github_tracker import ( + Command, + ) + + Command()._push_markdown(md, new_files) + + pub.assert_called_once_with(md, "myorg", "myrepo", "main", new_files) + assert not one.exists() + + +@pytest.mark.django_db +@override_settings( + CLANG_GITHUB_CONTEXT_REPO_OWNER="o", + CLANG_GITHUB_CONTEXT_REPO_NAME="r", + CLANG_GITHUB_CONTEXT_REPO_BRANCH="main", +) +def test_push_markdown_publish_failure_does_not_unlink(tmp_path): + """Failed publish leaves local md files in place.""" + md = tmp_path / "md_export" + md.mkdir() + one = md / "x.md" + one.write_text("keep", encoding="utf-8") + new_files = {"x.md": str(one)} + + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.publish_clang_markdown", + side_effect=CommandError("publish failed"), + ): + from clang_github_tracker.management.commands.run_clang_github_tracker import ( + Command, + ) + + with pytest.raises(CommandError, match="publish failed"): + Command()._push_markdown(md, new_files) + + assert one.exists() + assert one.read_text(encoding="utf-8") == "keep" diff --git a/clang_github_tracker/tests/test_preprocessors.py b/clang_github_tracker/tests/test_preprocessors.py index 1e517b6..eba6716 100644 --- 
a/clang_github_tracker/tests/test_preprocessors.py +++ b/clang_github_tracker/tests/test_preprocessors.py @@ -15,7 +15,10 @@ def test_issue_preprocessor_db_and_failed_ids(mock_build, tmp_path, settings): settings.CLANG_GITHUB_OWNER = "llvm" settings.CLANG_GITHUB_REPO = "llvm-project" - mock_build.return_value = {"content": "body", "metadata": {"doc_id": "u", "ids": "x"}} + mock_build.return_value = { + "content": "body", + "metadata": {"doc_id": "u", "ids": "x"}, + } p10 = tmp_path / "10.json" p10.write_text("{}", encoding="utf-8") @@ -44,7 +47,9 @@ def _issue_path(_owner, _repo, n): @pytest.mark.django_db @patch("clang_github_tracker.preprocessors.issue_preprocessor.build_issue_document") -def test_issue_preprocessor_all_rows_when_final_sync_none(mock_build, tmp_path, settings): +def test_issue_preprocessor_all_rows_when_final_sync_none( + mock_build, tmp_path, settings +): settings.CLANG_GITHUB_OWNER = "llvm" settings.CLANG_GITHUB_REPO = "llvm-project" mock_build.return_value = None diff --git a/clang_github_tracker/tests/test_publisher.py b/clang_github_tracker/tests/test_publisher.py new file mode 100644 index 0000000..946d232 --- /dev/null +++ b/clang_github_tracker/tests/test_publisher.py @@ -0,0 +1,150 @@ +"""Tests for clang_github_tracker.publisher.publish_clang_markdown.""" + +import subprocess +from pathlib import Path +from unittest.mock import patch + +import pytest +from django.core.management.base import CommandError +from django.test import override_settings + +from clang_github_tracker.publisher import publish_clang_markdown + + +@pytest.fixture +def raw_and_md(tmp_path: Path): + raw = tmp_path / "raw" + raw.mkdir() + md = tmp_path / "md_export" + md.mkdir() + clone_root = raw / "clang_github_tracker" / "acme" / "priv" + clone_root.mkdir(parents=True) + (clone_root / ".git").mkdir() + return raw, md, clone_root + + +def _author_settings(raw: Path): + return override_settings( + RAW_DIR=raw, + GIT_AUTHOR_NAME="Test", + 
GIT_AUTHOR_EMAIL="test@example.com", + ) + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_success_copies_and_pushes( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + """Happy path: overlay md_export into clone and call git_push.""" + raw, md, clone_root = raw_and_md + sub = md / "issues" / "2024" / "2024-01" + sub.mkdir(parents=True) + f = sub / "#1 - Title.md" + f.write_text("body", encoding="utf-8") + new_files = {"issues/2024/2024-01/#1 - Title.md": str(f)} + + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", new_files) + + copied = clone_root / "issues" / "2024" / "2024-01" / "#1 - Title.md" + assert copied.is_file() + assert copied.read_text(encoding="utf-8") == "body" + mock_push.assert_called_once() + kwargs = mock_push.call_args[1] + assert kwargs["branch"] == "main" + assert kwargs["commit_message"] == "chore: update Clang issues/PRs markdown" + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_push_failure_raises_command_error( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + raw, md, _clone_root = raw_and_md + err = subprocess.CalledProcessError(1, ["git", "push"]) + err.stderr = "rejected" + err.stdout = "" + mock_push.side_effect = err + + with _author_settings(raw): + with pytest.raises(CommandError, match="Git push failed"): + 
publish_clang_markdown(md, "acme", "priv", "main", {}) + + +@pytest.mark.django_db +def test_publish_clang_markdown_invalid_owner(raw_and_md): + raw, md, _ = raw_and_md + with _author_settings(raw): + with pytest.raises(CommandError, match="Invalid GitHub owner"): + publish_clang_markdown(md, "evil/org", "priv", "main", {}) + + +@pytest.mark.django_db +def test_publish_clang_markdown_overlap_errors(tmp_path: Path): + raw = tmp_path / "raw" + raw.mkdir() + clone = raw / "clang_github_tracker" / "acme" / "priv" + clone.mkdir(parents=True) + (clone / ".git").mkdir() + with _author_settings(raw): + with pytest.raises(CommandError, match="must not overlap"): + publish_clang_markdown(clone, "acme", "priv", "main", {}) + + +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.clone_repo") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_clones_when_no_git_dir( + _token, + mock_clone, + _prepare, + _pull, + _reset, + mock_push, + tmp_path: Path, +): + """Missing .git triggers clone_repo; mock creates minimal repo after rmtree.""" + raw = tmp_path / "raw" + raw.mkdir() + md = tmp_path / "md" + md.mkdir() + clone = raw / "clang_github_tracker" / "acme" / "priv" + clone.mkdir(parents=True) + + def _clone_side_effect(_slug, dest, **_kw): + dest = Path(dest) + if dest.exists(): + import shutil + + shutil.rmtree(dest) + dest.mkdir(parents=True) + (dest / ".git").mkdir() + + mock_clone.side_effect = _clone_side_effect + + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", {}) + mock_clone.assert_called_once() diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py index a72d7f2..6c32924 100644 --- 
a/clang_github_tracker/tests/test_services.py +++ b/clang_github_tracker/tests/test_services.py @@ -6,7 +6,7 @@ from django.utils import timezone from clang_github_tracker import services as clang_services -from clang_github_tracker.models import ClangGithubIssueItem +from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem @pytest.mark.django_db @@ -40,3 +40,30 @@ def test_watermarks_empty(): assert clang_services.get_issue_item_watermark() is None assert clang_services.get_commit_watermark() is None assert clang_services.start_after_watermark(None) is None + + +@pytest.mark.django_db +def test_upsert_commits_batch_create_and_update(): + sha_a = "a" * 40 + sha_b = "b" * 40 + t0 = timezone.now() - timedelta(days=1) + t1 = timezone.now() + ins, upd = clang_services.upsert_commits_batch([(sha_a, t0), (sha_b, t0)]) + assert ins == 2 and upd == 0 + ins2, upd2 = clang_services.upsert_commits_batch([(sha_a, t1)]) + assert ins2 == 0 and upd2 == 1 + assert ClangGithubCommit.objects.get(sha=sha_a).github_committed_at == t1 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_create_and_update(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + ins, upd = clang_services.upsert_issue_items_batch( + [(10, False, t0, t0), (11, True, t0, t0)] + ) + assert ins == 2 and upd == 0 + ins2, upd2 = clang_services.upsert_issue_items_batch([(10, False, t0, t1)]) + assert ins2 == 0 and upd2 == 1 + row = ClangGithubIssueItem.objects.get(number=10) + assert row.github_updated_at == t1 diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index d4fc848..282e03a 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -38,8 +38,8 @@ def test_resolve_empty_db_no_since_until(): @pytest.mark.django_db -def test_resolve_db_watermark_plus_one_second(): - """Max github fields drive start = max + 1s.""" +def 
test_resolve_db_watermark_plus_one_millisecond(): + """Max github fields drive start = max + 1ms (API lower bound).""" base = timezone.now() - timedelta(days=1) ClangGithubIssueItem.objects.create( number=1, @@ -52,8 +52,9 @@ def test_resolve_db_watermark_plus_one_second(): github_committed_at=base, ) sc, si, end = clang_state.resolve_start_end_dates(None, None) - assert sc == base + timedelta(seconds=1) - assert si == base + timedelta(seconds=1) + delta = timedelta(milliseconds=1) + assert sc == base + delta + assert si == base + delta @pytest.mark.django_db @@ -70,10 +71,15 @@ def test_resolve_both_since_until_closed_window(): @pytest.mark.django_db def test_resolve_invalid_range_clears_bounds(caplog): """since > until → warning and DB-based resolution.""" + wm = timezone.now() - timedelta(hours=1) ClangGithubIssueItem.objects.create( number=99, is_pull_request=False, - github_updated_at=timezone.now() - timedelta(hours=1), + github_updated_at=wm, + ) + ClangGithubCommit.objects.create( + sha="c" * 40, + github_committed_at=wm, ) since = timezone.now() until = timezone.now() - timedelta(days=1) diff --git a/clang_github_tracker/workspace.py b/clang_github_tracker/workspace.py index dc12814..c7f14ae 100644 --- a/clang_github_tracker/workspace.py +++ b/clang_github_tracker/workspace.py @@ -3,7 +3,7 @@ Layout: workspace/clang_github_activity/ - - md_export/ (generated Markdown for private repo push) + - md_export/ (generated Markdown for GitHub publish) workspace/clang_github_tracker/ - clang_github_tracker_backfill.csv (default CSV backfill path) workspace/raw/github_activity_tracker/// diff --git a/config/settings.py b/config/settings.py index 12a7871..060aa0c 100644 --- a/config/settings.py +++ b/config/settings.py @@ -169,10 +169,10 @@ # ============================================================================= # Clang GitHub Tracker -# Syncs llvm/llvm-project (issues, PRs, commits) to raw workspace only (no DB). 
-# After sync, updated issues/PRs are exported as Markdown and pushed to the -# private repo below. If OWNER or NAME is not set, upload is skipped and an -# error is logged. +# Syncs llvm/llvm-project (issues, PRs, commits) to raw + DB. +# Markdown export push target: CLANG_GITHUB_CONTEXT_REPO_OWNER / _NAME / _BRANCH +# (separate from CLANG_GITHUB_OWNER + CLANG_GITHUB_REPO, the upstream llvm source). +# If context owner or name is unset, push is skipped and an error is logged. # Folder structure: issues/YYYY/YYYY-MM/#N - title.md (no repo prefix) # ============================================================================= # Boost GitHub owner (used by boost_library_tracker preprocessors for Pinecone sync) @@ -249,15 +249,17 @@ CLANG_GITHUB_REPO = ( env("CLANG_GITHUB_REPO", default="llvm-project") or "llvm-project" ).strip() or "llvm-project" -CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_OWNER", default="") or "" +CLANG_GITHUB_CONTEXT_REPO_OWNER = ( + env("CLANG_GITHUB_CONTEXT_REPO_OWNER", default="") or "" ).strip() -CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_NAME", default="") or "" +CLANG_GITHUB_CONTEXT_REPO_NAME = ( + env("CLANG_GITHUB_CONTEXT_REPO_NAME", default="") or "" ).strip() -CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH = ( - env("CLANG_GITHUB_TRACKER_PRIVATE_REPO_BRANCH", default="main") or "main" +CLANG_GITHUB_CONTEXT_REPO_BRANCH = ( + env("CLANG_GITHUB_CONTEXT_REPO_BRANCH", default="") or "" ).strip() +# Markdown publish: persistent git clone under RAW_DIR/clang_github_tracker///; +# clone/pull/push use GITHUB_TOKEN_WRITE (via get_github_token write); GIT_AUTHOR_* for commits. 
# GitHub tokens (multiple use cases: scraping, write) # - GITHUB_TOKEN: fallback when a specific token is not set diff --git a/config/test_settings.py b/config/test_settings.py index 7461925..a175bdb 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -66,3 +66,7 @@ # Clang GitHub Tracker (tests use defaults) CLANG_GITHUB_OWNER = "llvm" CLANG_GITHUB_REPO = "llvm-project" +# Do not inherit publish target from developer .env (avoids real git / token in tests). +CLANG_GITHUB_CONTEXT_REPO_OWNER = "" +CLANG_GITHUB_CONTEXT_REPO_NAME = "" +CLANG_GITHUB_CONTEXT_REPO_BRANCH = "" diff --git a/docs/service_api/clang_github_tracker.md b/docs/service_api/clang_github_tracker.md index 8655cbc..f3ab9ae 100644 --- a/docs/service_api/clang_github_tracker.md +++ b/docs/service_api/clang_github_tracker.md @@ -1,6 +1,6 @@ # clang_github_tracker.services -**Module path:** `clang_github_tracker.services` +**Module path:** `clang_github_tracker.services` **Description:** Upserts for `ClangGithubIssueItem` and `ClangGithubCommit` (no FKs to other apps). Used by `sync_clang_github_activity`, `backfill_clang_github_tracker`, and date resolution watermarks. **Type notation:** Models live in `clang_github_tracker.models`. diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index b2f0b75..cd92d26 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -162,7 +162,11 @@ def clone_repo( depth: Optional[int] = None, ) -> None: """ - Clone a GitHub repo. Uses scraping token by default (read-only). + Clone a GitHub repo. + + If ``token`` is omitted, uses the scraping token (``get_github_token(use="scraping")``). + Callers cloning **private** repos must pass ``token=get_github_token(use="write")`` + (or equivalent) so GitHub authenticates with a PAT that has repository access. 
""" dest_dir = Path(dest_dir) if token is None: @@ -208,7 +212,13 @@ def clone_repo( e.returncode, err_tail, ) - raise + # Never re-raise with the real cmd: it embeds the token in the clone URL. + safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] + if depth is not None: + safe_cmd.extend(["--depth", str(depth)]) + raise subprocess.CalledProcessError( + e.returncode, safe_cmd, e.stdout, e.stderr + ) from None def push( diff --git a/operations/md_ops/github_export.py b/operations/md_ops/github_export.py index 252cc7d..e4ee905 100644 --- a/operations/md_ops/github_export.py +++ b/operations/md_ops/github_export.py @@ -1,12 +1,14 @@ """ Export synced GitHub issues/PRs as Markdown files into a folder structure -suitable for pushing to a private GitHub repository. +suitable for pushing to a target GitHub repository. Public API: write_md_files(owner, repo, issue_numbers, pr_numbers, output_dir, folder_prefix) detect_renames(remote_tree, new_files) -> list[str] detect_renames_from_dirs(owner, repo, branch, new_files, *, token) -> list[str] Use for large repos (100k+ files); lists only the directories we write to. + detect_stale_titled_paths(base_dir, new_files) -> list[str] + Local Path listing (md_export or clone); same #n title-rename rules; no API. 
Folder structure produced: //issues/YYYY/YYYY-MM/# - .md @@ -170,6 +172,40 @@ def write_md_files( return new_files +def _stale_titled_paths_vs_listing( + new_files: dict[str, str], + files_by_dir: dict[str, list[tuple[str, str]]], + *, + log_prefix: str = "stale_titled", +) -> list[str]: + """Paths to remove: same directory + same #n - prefix as a new file, different filename.""" + if not new_files or not files_by_dir: + return [] + + delete_paths: list[str] = [] + for new_repo_rel in new_files: + new_filename = new_repo_rel.rsplit("/", 1)[-1] + new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" + + m = _NUMBER_PREFIX.match(new_filename) + if not m: + continue + number_str = m.group(1) + prefix = f"#{number_str} - " + + for listed_filename, listed_path in files_by_dir.get(new_dir, []): + if listed_filename.startswith(prefix) and listed_filename != new_filename: + logger.debug( + "%s: %r → %r (title changed, will delete old)", + log_prefix, + listed_path, + new_repo_rel, + ) + delete_paths.append(listed_path) + + return sorted(set(delete_paths)) + + def detect_renames( remote_tree: list[dict], new_files: dict[str, str], @@ -192,7 +228,7 @@ def detect_renames( return [] # Build a lookup: directory → list of (filename, full_path) for blob entries - remote_by_dir: dict[str, list[tuple[str, str]]] = {} + files_by_dir: dict[str, list[tuple[str, str]]] = {} for item in remote_tree: if item.get("type") != "blob": continue @@ -201,29 +237,11 @@ def detect_renames( continue parent = path.rsplit("/", 1)[0] if "/" in path else "" filename = path.rsplit("/", 1)[-1] - remote_by_dir.setdefault(parent, []).append((filename, path)) - - delete_paths: list[str] = [] - for new_repo_rel in new_files: - new_filename = new_repo_rel.rsplit("/", 1)[-1] - new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" - - m = _NUMBER_PREFIX.match(new_filename) - if not m: - continue - number_str = m.group(1) - prefix = f"#{number_str} - " - - for 
remote_filename, remote_path in remote_by_dir.get(new_dir, []): - if remote_filename.startswith(prefix) and remote_filename != new_filename: - logger.debug( - "detect_renames: %r → %r (title changed, will delete old)", - remote_path, - new_repo_rel, - ) - delete_paths.append(remote_path) + files_by_dir.setdefault(parent, []).append((filename, path)) - return delete_paths + return _stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_renames" + ) def detect_renames_from_dirs( @@ -241,7 +259,7 @@ def detect_renames_from_dirs( only a small number of API calls are made. Args: - owner: Repository owner (e.g. private repo owner). + owner: Repository owner (markdown publish target). repo: Repository name. branch: Branch name. new_files: Dict of {repo_relative_path: local_path} from write_md_files(). @@ -260,32 +278,64 @@ def detect_renames_from_dirs( else: dirs.add("") - delete_paths: list[str] = [] + files_by_dir: dict[str, list[tuple[str, str]]] = {} for dir_path in sorted(dirs): - remote_paths = list_remote_directory(owner, repo, branch, dir_path, token=token) - for remote_path in remote_paths: + for remote_path in list_remote_directory( + owner, repo, branch, dir_path, token=token + ): filename = remote_path.rsplit("/", 1)[-1] - m = _NUMBER_PREFIX.match(filename) - if not m: - continue - number_str = m.group(1) - prefix = f"#{number_str} - " remote_dir = remote_path.rsplit("/", 1)[0] if "/" in remote_path else "" - for new_repo_rel in new_files: - new_dir = new_repo_rel.rsplit("/", 1)[0] if "/" in new_repo_rel else "" - if new_dir != remote_dir: - continue - new_filename = new_repo_rel.rsplit("/", 1)[-1] - if new_filename.startswith(prefix) and new_filename != filename: - logger.debug( - "detect_renames_from_dirs: %r → %r (title changed, will delete old)", - remote_path, - new_repo_rel, - ) - delete_paths.append(remote_path) - break - - return delete_paths + files_by_dir.setdefault(remote_dir, []).append((filename, remote_path)) + + return 
_stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_renames_from_dirs" + ) + + +def detect_stale_titled_paths( + base_dir: Path, + new_files: dict[str, str], +) -> list[str]: + """Find paths under base_dir to delete (old title) using local directory listings. + + Same rules as detect_renames_from_dirs, but lists each affected directory with + Path.iterdir (no GitHub API). Use on md_export and on a clone after pull. + + Args: + base_dir: Root to resolve paths against (md_export root or repo clone root). + new_files: Dict of {repo_relative_path: local_path} from write_md_files(). + + Returns: + Paths relative to base_dir (posix) that should be unlinked. + """ + base_dir = base_dir.resolve() + if not new_files: + return [] + + dirs: set[str] = set() + for repo_rel in new_files: + if "/" in repo_rel: + dirs.add(repo_rel.rsplit("/", 1)[0]) + else: + dirs.add("") + + files_by_dir: dict[str, list[tuple[str, str]]] = {} + for dir_path in dirs: + scan = base_dir if dir_path == "" else base_dir / dir_path + if not scan.is_dir(): + continue + for p in scan.iterdir(): + if p.name.startswith(".") or p.name == ".git": + continue + if not p.is_file() or p.suffix.lower() != ".md": + continue + rel = p.relative_to(base_dir).as_posix() + parent = rel.rsplit("/", 1)[0] if "/" in rel else "" + files_by_dir.setdefault(parent, []).append((p.name, rel)) + + return _stale_titled_paths_vs_listing( + new_files, files_by_dir, log_prefix="detect_stale_titled_paths" + ) def _parse_dt(value: object) -> Optional[datetime]: diff --git a/operations/tests/test_github_export.py b/operations/tests/test_github_export.py new file mode 100644 index 0000000..c483dc1 --- /dev/null +++ b/operations/tests/test_github_export.py @@ -0,0 +1,111 @@ +"""Tests for operations.md_ops.github_export rename and stale-path helpers.""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +from operations.md_ops.github_export import ( + detect_renames, + 
detect_renames_from_dirs, + detect_stale_titled_paths, +) + + +def test_detect_renames_from_dirs_empty_new_files(): + """No new files means nothing to compare.""" + assert detect_renames_from_dirs("o", "r", "main", {}, token="t") == [] + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_finds_old_title(mock_list_remote: MagicMock): + """Remote dir lists old filename; new_files has new title for same #5.""" + mock_list_remote.return_value = [ + "issues/2024/2024-03/#5 - Old title.md", + ] + new_files = { + "issues/2024/2024-03/#5 - New title.md": "/tmp/x", + } + out = detect_renames_from_dirs("own", "repo", "main", new_files, token="tok") + assert out == ["issues/2024/2024-03/#5 - Old title.md"] + mock_list_remote.assert_called() + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_no_conflict(mock_list_remote: MagicMock): + """Remote only has the same filename as new_files.""" + mock_list_remote.return_value = [ + "issues/2024/2024-03/#5 - Same.md", + ] + new_files = {"issues/2024/2024-03/#5 - Same.md": "/tmp/x"} + assert detect_renames_from_dirs("o", "r", "main", new_files, token="t") == [] + + +@patch("operations.md_ops.github_export.list_remote_directory") +def test_detect_renames_from_dirs_non_numbered_md_ignored(mock_list_remote: MagicMock): + """Files not matching #n - prefix are ignored.""" + mock_list_remote.return_value = ["issues/2024/2024-03/README.md"] + new_files = {"issues/2024/2024-03/#5 - T.md": "/tmp/x"} + assert detect_renames_from_dirs("o", "r", "main", new_files, token="t") == [] + + +def test_detect_renames_success_matches_tree(): + """detect_renames uses same semantics as directory listing.""" + tree = [ + {"type": "blob", "path": "issues/2024/2024-03/#5 - Old title.md"}, + ] + new_files = {"issues/2024/2024-03/#5 - New title.md": "/x"} + assert detect_renames(tree, new_files) == ["issues/2024/2024-03/#5 - Old title.md"] + + +def 
test_detect_renames_empty_tree(): + assert detect_renames([], {"a/b.md": "/x"}) == [] + + +def test_detect_stale_titled_paths_finds_old_file_on_disk(tmp_path: Path): + """Local directory has old title; new_files points to new title.""" + d = tmp_path / "issues" / "2024" / "2024-03" + d.mkdir(parents=True) + old = d / "#5 - Old title.md" + old.write_text("old", encoding="utf-8") + new_files = {"issues/2024/2024-03/#5 - New title.md": str(d / "#5 - New title.md")} + stale = detect_stale_titled_paths(tmp_path, new_files) + assert stale == ["issues/2024/2024-03/#5 - Old title.md"] + + +def test_detect_stale_titled_paths_only_canonical(tmp_path: Path): + """Only the new filename present → no stale paths.""" + d = tmp_path / "pull_requests" / "2024" / "2024-01" + d.mkdir(parents=True) + f = d / "#10 - Only.md" + f.write_text("x", encoding="utf-8") + new_files = {"pull_requests/2024/2024-01/#10 - Only.md": str(f)} + assert detect_stale_titled_paths(tmp_path, new_files) == [] + + +def test_detect_stale_titled_paths_missing_month_dir(tmp_path: Path): + """Missing directory is treated as empty.""" + new_files = {"issues/2024/2024-99/#1 - A.md": "/nope"} + assert detect_stale_titled_paths(tmp_path, new_files) == [] + + +def test_detect_stale_titled_paths_empty_new_files(tmp_path: Path): + assert detect_stale_titled_paths(tmp_path, {}) == [] + + +def test_detect_stale_titled_paths_union_two_dirs(tmp_path: Path): + """Multiple parent dirs each with stale file.""" + for sub, old_name in ( + ("issues/2024/2024-01", "#1 - Old.md"), + ("issues/2024/2024-02", "#2 - Was.md"), + ): + p = tmp_path / sub + p.mkdir(parents=True) + (p / old_name).write_text("o", encoding="utf-8") + new_files = { + "issues/2024/2024-01/#1 - New.md": "/a", + "issues/2024/2024-02/#2 - Now.md": "/b", + } + stale = set(detect_stale_titled_paths(tmp_path, new_files)) + assert stale == { + "issues/2024/2024-01/#1 - Old.md", + "issues/2024/2024-02/#2 - Was.md", + } From 113713b4791c50cfa0d609f7a7eff6f3ad4292bc 
Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Wed, 1 Apr 2026 23:35:34 -0400 Subject: [PATCH 63/76] Fix: lint/format error - #136 --- clang_github_tracker/publisher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index 6472c05..dadc893 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -50,9 +50,7 @@ def _reset_hard_to_upstream(clone_dir: Path, remote: str, branch: str) -> None: ) except subprocess.CalledProcessError as e: err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) - raise CommandError( - f"Could not reset clone to {ref}: {err}" - ) from e + raise CommandError(f"Could not reset clone to {ref}: {err}") from e def _copy_md_tree(md_output_dir: Path, clone_dir: Path) -> None: @@ -141,7 +139,9 @@ def publish_clang_markdown( "that repository. GitHub often returns 'not found' when the token " "lacks access." ) - logger.error("clang_github_tracker publish: git clone failed: %s", tail or e) + logger.error( + "clang_github_tracker publish: git clone failed: %s", tail or e + ) raise CommandError( f"Git clone failed for {repo_slug}: {tail or e.returncode}. 
{hint}" ) from e From dc5a43b2b3329c4f8cccaee2e5ae699bc5676b83 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Thu, 2 Apr 2026 10:21:28 -0400 Subject: [PATCH 64/76] fix(clang): docstrings, batch updated_at, CSV/CLI/Pinecone fixes; redact git clone logs; ignore /nul - #136 --- .gitignore | 1 + clang_github_tracker/__init__.py | 2 +- clang_github_tracker/apps.py | 4 ++ .../commands/backfill_clang_github_tracker.py | 15 +++++ .../commands/run_clang_github_tracker.py | 37 ++++++----- clang_github_tracker/models.py | 4 ++ .../preprocessors/__init__.py | 1 + clang_github_tracker/publisher.py | 8 ++- clang_github_tracker/services.py | 13 +++- clang_github_tracker/state_manager.py | 1 + clang_github_tracker/tests/test_backfill.py | 16 +++++ clang_github_tracker/tests/test_commands.py | 62 ++++++++++++++++--- clang_github_tracker/tests/test_services.py | 11 +++- .../tests/test_state_manager.py | 4 +- github_ops/git_ops.py | 24 ++++++- github_ops/tests/test_git_ops.py | 23 +++++++ 16 files changed, 188 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index aeb51c0..d264443 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ discord_activity_tracker/tools/ config/boost_collector_schedule.yaml # temp files temp/ +nul \ No newline at end of file diff --git a/clang_github_tracker/__init__.py b/clang_github_tracker/__init__.py index 6337852..10b9490 100644 --- a/clang_github_tracker/__init__.py +++ b/clang_github_tracker/__init__.py @@ -1 +1 @@ -"""Fetch GitHub activity for a configurable repo to raw JSON only (no DB).""" +"""Clang GitHub tracker: sync configured repo to raw JSON and tracker DB tables.""" diff --git a/clang_github_tracker/apps.py b/clang_github_tracker/apps.py index aec5d50..2faffb3 100644 --- a/clang_github_tracker/apps.py +++ b/clang_github_tracker/apps.py @@ -1,7 +1,11 @@ +"""Django app config for clang_github_tracker.""" + from django.apps import AppConfig class ClangGithubTrackerConfig(AppConfig): + 
"""Registers the clang_github_tracker application.""" + default_auto_field = "django.db.models.BigAutoField" name = "clang_github_tracker" verbose_name = "Clang GitHub Tracker" diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py index 9657754..bce2f32 100644 --- a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -34,6 +34,7 @@ def _commit_date_from_json(data: dict): + """Parse commit author/committer date from a GitHub API-style JSON dict.""" commit = data.get("commit") or {} author = commit.get("author") or commit.get("committer") or {} date_str = author.get("date") @@ -43,6 +44,8 @@ def _commit_date_from_json(data: dict): class Command(BaseCommand): + """Load ``ClangGithubIssueItem`` / ``ClangGithubCommit`` from CSV or raw JSON.""" + help = ( "Backfill clang_github_tracker DB from CSV (--from-csv) or raw JSON dirs (--from-raw). 
" "CSV columns: record_type (issue|pr|commit), number, github_created_at, github_updated_at, " @@ -50,6 +53,7 @@ class Command(BaseCommand): ) def add_arguments(self, parser): + """Add mutually exclusive ``--from-csv`` and ``--from-raw`` options.""" group = parser.add_mutually_exclusive_group(required=True) group.add_argument( "--from-csv", @@ -69,6 +73,7 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): + """Dispatch to CSV or raw-directory backfill.""" if options.get("from_raw"): self._backfill_from_raw() return @@ -77,6 +82,7 @@ def handle(self, *args, **options): self._backfill_from_csv(path) def _backfill_from_csv(self, path: Path) -> None: + """Parse CSV at ``path`` and batch-upsert issues, PRs, and commits.""" if not path.is_file(): raise CommandError(f"CSV not found: {path}") commit_rows: list[tuple[str, datetime | None]] = [] @@ -91,6 +97,10 @@ def _backfill_from_csv(self, path: Path) -> None: try: if rt == "issue": num = int((row.get("number") or "").strip()) + if num <= 0: + logger.warning("skip issue row: invalid number %r", num) + skipped += 1 + continue gc = parse_datetime( (row.get("github_created_at") or "").strip() ) @@ -100,6 +110,10 @@ def _backfill_from_csv(self, path: Path) -> None: issue_rows.append((num, False, gc, gu)) elif rt == "pr": num = int((row.get("number") or "").strip()) + if num <= 0: + logger.warning("skip pr row: invalid number %r", num) + skipped += 1 + continue gc = parse_datetime( (row.get("github_created_at") or "").strip() ) @@ -138,6 +152,7 @@ def _backfill_from_csv(self, path: Path) -> None: ) def _backfill_from_raw(self) -> None: + """Scan ``commits`` / ``issues`` / ``prs`` JSON under the raw repo dir and upsert.""" root = get_raw_repo_dir(OWNER, REPO, create=False) if not root.is_dir(): raise CommandError(f"Raw repo dir missing: {root}") diff --git a/clang_github_tracker/management/commands/run_clang_github_tracker.py b/clang_github_tracker/management/commands/run_clang_github_tracker.py index 
e079ce1..59507e6 100644 --- a/clang_github_tracker/management/commands/run_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/run_clang_github_tracker.py @@ -68,6 +68,7 @@ class Command(BaseCommand): ) def add_arguments(self, parser): + """Define dry-run, skip flags, and optional ``--since`` / ``--until`` window.""" parser.add_argument( "--dry-run", action="store_true", @@ -115,6 +116,7 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): + """Resolve sync window, then run GitHub fetch, Markdown, push, and Pinecone as configured.""" dry_run = options["dry_run"] skip_github_sync = options["skip_github_sync"] skip_markdown_export = options["skip_markdown_export"] @@ -220,22 +222,28 @@ def handle(self, *args, **options): if not skip_pinecone: app_type = (settings.CLANG_GITHUB_PINECONE_APP_TYPE or "").strip() namespace = (settings.CLANG_GITHUB_PINECONE_NAMESPACE or "").strip() - _run_pinecone_sync( - f"{app_type}-issues", - namespace, - "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", - ) - _run_pinecone_sync( - f"{app_type}-prs", - namespace, - "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", - ) + if not app_type: + logger.warning( + "Pinecone sync skipped: CLANG_GITHUB_PINECONE_APP_TYPE is empty (settings/env)." 
+ ) + else: + _run_pinecone_sync( + f"{app_type}-issues", + namespace, + "clang_github_tracker.preprocessors.issue_preprocessor.preprocess_for_pinecone", + ) + _run_pinecone_sync( + f"{app_type}-prs", + namespace, + "clang_github_tracker.preprocessors.pr_preprocessor.preprocess_for_pinecone", + ) else: logger.info("skipping Pinecone (--skip-pinecone)") logger.info("run_clang_github_tracker finished successfully") def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None: + """Publish ``md_export`` to ``CLANG_GITHUB_CONTEXT_*`` and remove local run artifacts.""" clang_github_context_repo_owner = getattr( settings, "CLANG_GITHUB_CONTEXT_REPO_OWNER", "" ).strip() @@ -243,13 +251,8 @@ def _push_markdown(self, md_output_dir: Path, new_files: dict[str, str]) -> None settings, "CLANG_GITHUB_CONTEXT_REPO_NAME", "" ).strip() clang_github_context_repo_branch = ( - getattr( - settings, - "CLANG_GITHUB_CONTEXT_REPO_BRANCH", - DEFAULT_CLANG_REPO_BRANCH, - ) - or DEFAULT_CLANG_REPO_BRANCH - ).strip() + getattr(settings, "CLANG_GITHUB_CONTEXT_REPO_BRANCH", "") or "" + ).strip() or DEFAULT_CLANG_REPO_BRANCH if not clang_github_context_repo_owner or not clang_github_context_repo_name: logger.error( "CLANG_GITHUB_CONTEXT_REPO_OWNER / CLANG_GITHUB_CONTEXT_REPO_NAME " diff --git a/clang_github_tracker/models.py b/clang_github_tracker/models.py index bcbb3cb..031f9fe 100644 --- a/clang_github_tracker/models.py +++ b/clang_github_tracker/models.py @@ -25,6 +25,8 @@ class ClangGithubIssueItem(models.Model): ) class Meta: + """Maps to ``clang_github_tracker_issue_item``.""" + db_table = "clang_github_tracker_issue_item" @@ -37,4 +39,6 @@ class ClangGithubCommit(models.Model): updated_at = models.DateTimeField(auto_now=True) class Meta: + """Maps to ``clang_github_tracker_commit``.""" + db_table = "clang_github_tracker_commit" diff --git a/clang_github_tracker/preprocessors/__init__.py b/clang_github_tracker/preprocessors/__init__.py index e69de29..2645d38 100644 --- 
a/clang_github_tracker/preprocessors/__init__.py +++ b/clang_github_tracker/preprocessors/__init__.py @@ -0,0 +1 @@ +"""Pinecone preprocessor modules for clang_github_tracker.""" diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index dadc893..4dc929c 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -108,10 +108,12 @@ def publish_clang_markdown( # Private CLANG_GITHUB_CONTEXT_* repos need a PAT that can read them (clone/pull) # and push; get_github_token("write") uses GITHUB_TOKEN_WRITE or GITHUB_TOKEN. token = get_github_token(use="write") - git_user_name = (getattr(settings, "GIT_AUTHOR_NAME", None) or "unknown").strip() + git_user_name = ( + getattr(settings, "GIT_AUTHOR_NAME", None) or "" + ).strip() or "unknown" git_user_email = ( - getattr(settings, "GIT_AUTHOR_EMAIL", None) or "unknown@noreply.github.com" - ).strip() + getattr(settings, "GIT_AUTHOR_EMAIL", None) or "" + ).strip() or "unknown@noreply.github.com" repo_slug = f"{owner}/{repo}" logger.info("Publishing Clang markdown to %s (%s)...", repo_slug, branch) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index b8ba84f..d73117a 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -8,6 +8,7 @@ from typing import Optional from django.db.models import Max +from django.utils import timezone from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem @@ -72,13 +73,17 @@ def _flush_commits_chunk( existing = set( ClangGithubCommit.objects.filter(sha__in=shas).values_list("sha", flat=True) ) - objs = [ClangGithubCommit(sha=s, github_committed_at=dt) for s, dt in pairs] + now = timezone.now() + objs = [ + ClangGithubCommit(sha=s, github_committed_at=dt, updated_at=now) + for s, dt in pairs + ] ClangGithubCommit.objects.bulk_create( objs, batch_size=len(objs), update_conflicts=True, unique_fields=["sha"], - update_fields=["github_committed_at"], + 
update_fields=["github_committed_at", "updated_at"], ) inserted = sum(1 for s, _ in pairs if s not in existing) updated = len(pairs) - inserted @@ -113,6 +118,7 @@ def upsert_commits_batch( def _flush_issue_items_chunk( rows: list[tuple[int, bool, datetime | None, datetime | None]], ) -> tuple[int, int]: + """Bulk upsert one chunk of issue/PR rows; returns (inserted, updated).""" if not rows: return 0, 0 nums = [n for n, _, _, _ in rows] @@ -121,12 +127,14 @@ def _flush_issue_items_chunk( "number", flat=True ) ) + now = timezone.now() objs = [ ClangGithubIssueItem( number=n, is_pull_request=is_pr, github_created_at=gc, github_updated_at=gu, + updated_at=now, ) for n, is_pr, gc, gu in rows ] @@ -139,6 +147,7 @@ def _flush_issue_items_chunk( "is_pull_request", "github_created_at", "github_updated_at", + "updated_at", ], ) inserted = sum(1 for n, _, _, _ in rows if n not in existing) diff --git a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index 5c6ba1f..a12e352 100644 --- a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -31,6 +31,7 @@ def parse_iso(s: str | None) -> datetime | None: def _aware_utc(dt: datetime | None) -> datetime | None: + """Normalize ``dt`` to timezone-aware UTC, or return ``None``.""" if dt is None: return None if timezone.is_naive(dt): diff --git a/clang_github_tracker/tests/test_backfill.py b/clang_github_tracker/tests/test_backfill.py index 61138de..7bfe9c3 100644 --- a/clang_github_tracker/tests/test_backfill.py +++ b/clang_github_tracker/tests/test_backfill.py @@ -25,6 +25,22 @@ def test_backfill_csv(tmp_path): assert ClangGithubCommit.objects.filter(sha="a" * 40).exists() +@pytest.mark.django_db +def test_backfill_csv_skips_non_positive_issue_pr_numbers(tmp_path): + csv_path = tmp_path / "bad_nums.csv" + csv_path.write_text( + "record_type,number,github_created_at,github_updated_at,sha,github_committed_at\n" + "issue,0,,,,\n" + "issue,-3,,,,\n" + "pr,-1,,,,\n" + 
"issue,5,2024-01-01T00:00:00Z,2024-01-02T00:00:00Z,,\n", + encoding="utf-8", + ) + call_command("backfill_clang_github_tracker", f"--from-csv={csv_path}") + assert ClangGithubIssueItem.objects.filter(number=5, is_pull_request=False).exists() + assert ClangGithubIssueItem.objects.count() == 1 + + @pytest.mark.django_db def test_backfill_from_raw(tmp_path, monkeypatch): root = tmp_path / "raw" / OWNER / REPO diff --git a/clang_github_tracker/tests/test_commands.py b/clang_github_tracker/tests/test_commands.py index 990b100..8431c65 100644 --- a/clang_github_tracker/tests/test_commands.py +++ b/clang_github_tracker/tests/test_commands.py @@ -26,16 +26,24 @@ def test_run_clang_github_tracker_dry_run_logs_resolved(caplog): @pytest.mark.django_db -def test_run_clang_github_tracker_dry_run_skip_sync(caplog): - """Dry run with --skip-github-sync still logs resolved window.""" - with caplog.at_level(logging.INFO): - call_command( - CMD_NAME, - "--dry-run", - "--skip-github-sync", - stdout=StringIO(), - stderr=StringIO(), - ) +def test_run_clang_github_tracker_skip_sync(caplog): + """--skip-github-sync bypasses the GitHub sync step (not only under --dry-run).""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity" + ) as sync_mock, patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + with caplog.at_level(logging.INFO): + call_command( + CMD_NAME, + "--skip-github-sync", + "--skip-pinecone", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + sync_mock.assert_not_called() assert any("Resolved:" in r.getMessage() for r in caplog.records) @@ -113,6 +121,40 @@ def test_run_clang_github_tracker_skip_pinecone(caplog): assert not pinecone_calls +@pytest.mark.django_db +@override_settings(CLANG_GITHUB_PINECONE_APP_TYPE="") +def test_run_clang_github_tracker_empty_pinecone_app_type_skips_sync(caplog): + """Empty CLANG_GITHUB_PINECONE_APP_TYPE 
must not call run_cppa_pinecone_sync with -issues/-prs.""" + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.sync_clang_github_activity", + return_value=(0, [1], []), + ): + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.call_command" + ) as cc: + with patch( + "clang_github_tracker.management.commands.run_clang_github_tracker.write_md_files", + return_value={}, + ): + with caplog.at_level(logging.WARNING): + call_command( + CMD_NAME, + "--since=2024-01-01", + "--until=2024-01-02", + "--skip-remote-push", + stdout=StringIO(), + stderr=StringIO(), + ) + pinecone_calls = [ + c for c in cc.call_args_list if c[0] and c[0][0] == "run_cppa_pinecone_sync" + ] + assert not pinecone_calls + assert any( + "CLANG_GITHUB_PINECONE_APP_TYPE is empty" in r.getMessage() + for r in caplog.records + ) + + @pytest.mark.django_db @override_settings( CLANG_GITHUB_CONTEXT_REPO_OWNER="myorg", diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py index 6c32924..aa60436 100644 --- a/clang_github_tracker/tests/test_services.py +++ b/clang_github_tracker/tests/test_services.py @@ -50,9 +50,13 @@ def test_upsert_commits_batch_create_and_update(): t1 = timezone.now() ins, upd = clang_services.upsert_commits_batch([(sha_a, t0), (sha_b, t0)]) assert ins == 2 and upd == 0 + row = ClangGithubCommit.objects.get(sha=sha_a) + first_updated = row.updated_at ins2, upd2 = clang_services.upsert_commits_batch([(sha_a, t1)]) assert ins2 == 0 and upd2 == 1 - assert ClangGithubCommit.objects.get(sha=sha_a).github_committed_at == t1 + row.refresh_from_db() + assert row.github_committed_at == t1 + assert row.updated_at > first_updated @pytest.mark.django_db @@ -63,7 +67,10 @@ def test_upsert_issue_items_batch_create_and_update(): [(10, False, t0, t0), (11, True, t0, t0)] ) assert ins == 2 and upd == 0 + row = ClangGithubIssueItem.objects.get(number=10) + first_updated = row.updated_at ins2, upd2 
= clang_services.upsert_issue_items_batch([(10, False, t0, t1)]) assert ins2 == 0 and upd2 == 1 - row = ClangGithubIssueItem.objects.get(number=10) + row.refresh_from_db() assert row.github_updated_at == t1 + assert row.updated_at > first_updated diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 282e03a..7b47ee6 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -51,7 +51,7 @@ def test_resolve_db_watermark_plus_one_millisecond(): sha="a" * 40, github_committed_at=base, ) - sc, si, end = clang_state.resolve_start_end_dates(None, None) + sc, si, _end = clang_state.resolve_start_end_dates(None, None) delta = timedelta(milliseconds=1) assert sc == base + delta assert si == base + delta @@ -100,6 +100,6 @@ def test_resolve_since_floor_without_until(): github_updated_at=base, ) since = timezone.now() - timedelta(days=1) - sc, si, _end = clang_state.resolve_start_end_dates(since, None) + _sc, si, _end = clang_state.resolve_start_end_dates(since, None) assert si is not None assert si >= since diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index cd92d26..911d7ca 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -154,6 +154,27 @@ def _url_with_token(url: str, token: str) -> str: ) +def sanitize_git_output(text: str) -> str: + """Redact credentials from git stderr/stdout snippets before logging. + + Masks GitHub HTTPS PAT forms and other userinfo-in-URL patterns so logs do not + leak tokens when clone/push echoes the remote URL. 
+ """ + if not text: + return text + out = re.sub( + r"(?i)(x-access-token:)[^@\s]+(@)", + r"\1***\2", + text, + ) + out = re.sub( + r"(?i)(https?://)[^/\s?#]+@", + r"\1<redacted>@", + out, + ) + return out + + def clone_repo( url_or_slug: str, dest_dir: str | Path, @@ -205,12 +226,13 @@ def clone_repo( raise except subprocess.CalledProcessError as e: err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) logger.warning( "git clone failed (%s -> %s), returncode=%s, stderr/stdout_tail=%r", url_or_slug, dest_dir, e.returncode, - err_tail, + safe_err_tail, ) # Never re-raise with the real cmd: it embeds the token in the clone URL. safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 7a71d54..eded244 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -12,6 +12,7 @@ pull, get_commit_file_changes, push, + sanitize_git_output, upload_folder_to_github, ) @@ -45,6 +46,28 @@ def test_url_with_token_only_replaces_first_occurrence(): assert out == "https://x-access-token:tok@github.com/boostorg/boost.git" +# --- sanitize_git_output --- + + +def test_sanitize_git_output_masks_x_access_token(): + raw = "fatal: https://x-access-token:ghp_SUPER_SECRET@github.com/o/r.git not found" + out = sanitize_git_output(raw) + assert "ghp_SUPER_SECRET" not in out + assert "x-access-token:ghp_" not in out + assert "https://<redacted>@github.com" in out + + +def test_sanitize_git_output_masks_bare_token_userinfo(): + raw = "error cloning https://github_pat_XXXX@github.com/foo/bar.git" + out = sanitize_git_output(raw) + assert "github_pat_XXXX" not in out + assert "https://<redacted>@" in out + + +def test_sanitize_git_output_empty(): + assert sanitize_git_output("") == "" + + # --- clone_repo --- From 2a1f4ded7f3ec66cbb1961311348d76842337e5a Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> 
Date: Thu, 2 Apr 2026 10:23:24 -0400 Subject: [PATCH 65/76] Fix: lint/format error --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d264443..d9be1a3 100644 --- a/.gitignore +++ b/.gitignore @@ -52,4 +52,4 @@ discord_activity_tracker/tools/ config/boost_collector_schedule.yaml # temp files temp/ -nul \ No newline at end of file +nul From 9249784f599f709954776c8379bdab8ebfaab1b9 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Thu, 2 Apr 2026 13:04:07 -0400 Subject: [PATCH 66/76] fix(clang): max-merge duplicate rows in batch upserts; doc start_after_watermark as +1 ms - #136 --- clang_github_tracker/services.py | 28 +++++++++++++-- clang_github_tracker/tests/test_services.py | 40 +++++++++++++++++++++ docs/service_api/clang_github_tracker.md | 2 +- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index d73117a..1b0d7f9 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -17,6 +17,15 @@ DEFAULT_UPSERT_BATCH_SIZE = 500 +def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | None: + """Return the later of two datetimes; ``None`` is treated as missing (never wins over a value).""" + if current is None: + return incoming + if incoming is None: + return current + return max(current, incoming) + + def upsert_issue_item( number: int, *, @@ -105,7 +114,7 @@ def upsert_commits_batch( s = (sha or "").strip() if len(s) != 40: continue - merged[s] = dt + merged[s] = _max_dt(merged.get(s), dt) inserted = updated = 0 items = list(merged.items()) for i in range(0, len(items), batch_size): @@ -160,7 +169,11 @@ def upsert_issue_items_batch( *, batch_size: int = DEFAULT_UPSERT_BATCH_SIZE, ) -> tuple[int, int]: - """Batch upsert issue/PR rows by ``number``. Later rows win on duplicate numbers. + """Batch upsert issue/PR rows by ``number``. 
+ + Duplicate ``number`` values merge: ``github_updated_at`` uses the latest + timestamp; ``github_created_at`` keeps the first non-None; ``is_pull_request`` + is True if any row marks the number as a PR. Returns: (inserted, updated) counts across all batches. @@ -169,7 +182,16 @@ def upsert_issue_items_batch( for num, is_pr, gc, gu in rows: if not isinstance(num, int) or num <= 0: continue - merged[num] = (is_pr, gc, gu) + prev = merged.get(num) + if prev is None: + merged[num] = (is_pr, gc, gu) + else: + prev_is_pr, prev_gc, prev_gu = prev + merged[num] = ( + prev_is_pr or is_pr, + prev_gc if prev_gc is not None else gc, + _max_dt(prev_gu, gu), + ) inserted = updated = 0 items = [(n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items())] for i in range(0, len(items), batch_size): diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py index aa60436..d79ce97 100644 --- a/clang_github_tracker/tests/test_services.py +++ b/clang_github_tracker/tests/test_services.py @@ -74,3 +74,43 @@ def test_upsert_issue_items_batch_create_and_update(): row.refresh_from_db() assert row.github_updated_at == t1 assert row.updated_at > first_updated + + +@pytest.mark.django_db +def test_upsert_commits_batch_duplicate_sha_keeps_latest_committed_at(): + sha = "c" * 40 + t_new = timezone.now() + t_old = t_new - timedelta(days=7) + clang_services.upsert_commits_batch([(sha, t_new), (sha, t_old)]) + row = ClangGithubCommit.objects.get(sha=sha) + assert row.github_committed_at == t_new + + +@pytest.mark.django_db +def test_upsert_commits_batch_duplicate_sha_none_does_not_wipe_timestamp(): + sha = "d" * 40 + t0 = timezone.now() - timedelta(hours=1) + clang_services.upsert_commits_batch([(sha, t0), (sha, None)]) + assert ClangGithubCommit.objects.get(sha=sha).github_committed_at == t0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_duplicate_number_keeps_latest_github_updated_at(): + t_base = timezone.now() - 
timedelta(days=5) + t_new = timezone.now() + t_old = t_new - timedelta(days=1) + clang_services.upsert_issue_items_batch( + [ + (7, False, t_base, t_new), + (7, False, t_base, t_old), + ] + ) + row = ClangGithubIssueItem.objects.get(number=7) + assert row.github_updated_at == t_new + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_duplicate_merges_is_pr_or(): + t0 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(8, False, t0, t0), (8, True, t0, t0)]) + assert ClangGithubIssueItem.objects.get(number=8).is_pull_request is True diff --git a/docs/service_api/clang_github_tracker.md b/docs/service_api/clang_github_tracker.md index f3ab9ae..3e53ff5 100644 --- a/docs/service_api/clang_github_tracker.md +++ b/docs/service_api/clang_github_tracker.md @@ -22,7 +22,7 @@ | -------- | ------ | ----- | | `get_issue_item_watermark` | `datetime \| None` | `Max(github_updated_at)` over all issue/PR rows (unified issues+PR stream). | | `get_commit_watermark` | `datetime \| None` | `Max(github_committed_at)` over commits. | -| `start_after_watermark` | `datetime \| None` | `max_dt + timedelta(seconds=1)` or `None` if `max_dt` is `None`. | +| `start_after_watermark` | `datetime \| None` | `max_dt + timedelta(milliseconds=1)` or `None` if `max_dt` is `None`. | Used by `clang_github_tracker.state_manager.resolve_start_end_dates` (with optional CLI `--since` / `--until` bounds). 
From abe1519101b36892c791abefbe8a895a46f0e129 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 10:35:46 -0400 Subject: [PATCH 67/76] Fix: map prepare/pull git errors to CommandError - #136 --- clang_github_tracker/publisher.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index 4dc929c..2889823 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -149,10 +149,33 @@ def publish_clang_markdown( ) from e logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) - prepare_repo_for_pull(clone_dir, remote="origin", token=token) + try: + prepare_repo_for_pull(clone_dir, remote="origin", token=token) + except subprocess.CalledProcessError as e: + err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + logger.error( + "clang_github_tracker publish: prepare clone for pull failed " + "(clone_dir=%s, branch=%s): %s", + clone_dir, + branch, + err, + exc_info=e, + ) + raise CommandError(f"Failed to prepare clone for pull: {err}") from e logger.info("Pulling latest for %s", clone_dir) - pull(clone_dir, branch=branch, token=token) + try: + pull(clone_dir, branch=branch, token=token) + except subprocess.CalledProcessError as e: + err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + logger.error( + "clang_github_tracker publish: git pull failed (clone_dir=%s, branch=%s): %s", + clone_dir, + branch, + err, + exc_info=e, + ) + raise CommandError(f"Git pull failed: {err}") from e logger.info("Resetting clone to origin/%s (discard unpushed commits)", branch) _reset_hard_to_upstream(clone_dir, "origin", branch) From 4968e5cea78017d5ba151da7a34f2f76ed0850eb Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 13:12:19 -0400 Subject: [PATCH 68/76] fix: defensive clang GitHub upserts, Docker safe.directory, and 
redacted git clone errors - #136 --- Dockerfile | 4 + clang_github_tracker/publisher.py | 7 +- clang_github_tracker/services.py | 98 ++++++++++++++----- .../tests/test_preprocessors.py | 9 +- clang_github_tracker/tests/test_services.py | 70 +++++++++++++ .../tests/test_state_manager.py | 9 +- github_ops/git_ops.py | 6 +- 7 files changed, 169 insertions(+), 34 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5b7b51c..a2c10b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,6 +33,10 @@ RUN chmod +x /app/docker-entrypoint.sh # Entrypoint runs as root, chowns mounted dirs, then exec's CMD as appuser via gosu RUN useradd --create-home appuser && chown -R appuser /app +# Git 2.35+ blocks repos when directory owner != current user; bind mounts often +# disagree (e.g. Docker Desktop on Windows). System config applies to root and appuser +# (e.g. docker exec as root vs gosu appuser in entrypoint). +RUN git config --system --add safe.directory '*' ENTRYPOINT ["/app/docker-entrypoint.sh"] # Container starts as root so entrypoint can chown; CMD runs as appuser via gosu diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index 2889823..965c92a 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -11,7 +11,12 @@ from django.conf import settings from django.core.management.base import CommandError -from github_ops.git_ops import clone_repo, prepare_repo_for_pull, pull, push as git_push +from github_ops.git_ops import ( + clone_repo, + prepare_repo_for_pull, + pull, + push as git_push, +) from github_ops.tokens import get_github_token from operations.md_ops.github_export import detect_stale_titled_paths diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index 1b0d7f9..fda9f85 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -26,6 +26,26 @@ def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | N return max(current, 
incoming) +def _merge_issue_item_fields( + existing: ClangGithubIssueItem | None, + is_pull_request: bool, + github_created_at: datetime | None, + github_updated_at: datetime | None, +) -> tuple[bool, datetime | None, datetime | None]: + """Merge incoming issue/PR fields with a stored row (None / older incoming must not weaken state).""" + if existing is None: + return (is_pull_request, github_created_at, github_updated_at) + return ( + existing.is_pull_request or is_pull_request, + ( + github_created_at + if github_created_at is not None + else existing.github_created_at + ), + _max_dt(existing.github_updated_at, github_updated_at), + ) + + def upsert_issue_item( number: int, *, @@ -34,12 +54,19 @@ def upsert_issue_item( github_updated_at: datetime | None, ) -> tuple[ClangGithubIssueItem, bool]: """Create or update a ClangGithubIssueItem by ``number``. Returns (instance, created).""" + existing = ClangGithubIssueItem.objects.filter(number=number).first() + is_pr, gc, gu = _merge_issue_item_fields( + existing, + is_pull_request, + github_created_at, + github_updated_at, + ) obj, created = ClangGithubIssueItem.objects.update_or_create( number=number, defaults={ - "is_pull_request": is_pull_request, - "github_created_at": github_created_at, - "github_updated_at": github_updated_at, + "is_pull_request": is_pr, + "github_created_at": gc, + "github_updated_at": gu, }, ) logger.debug( @@ -60,9 +87,14 @@ def upsert_commit( sha_clean = (sha or "").strip() if len(sha_clean) != 40: raise ValueError(f"commit sha must be 40 hex chars, got {sha_clean!r}") + existing = ClangGithubCommit.objects.filter(sha=sha_clean).first() + merged_committed_at = _max_dt( + existing.github_committed_at if existing else None, + github_committed_at, + ) obj, created = ClangGithubCommit.objects.update_or_create( sha=sha_clean, - defaults={"github_committed_at": github_committed_at}, + defaults={"github_committed_at": merged_committed_at}, ) logger.debug( "clang commit %s %s", @@ -79,12 +111,20 @@ 
def _flush_commits_chunk( if not pairs: return 0, 0 shas = [s for s, _ in pairs] - existing = set( - ClangGithubCommit.objects.filter(sha__in=shas).values_list("sha", flat=True) - ) + existing_committed = { + row.sha: row.github_committed_at + for row in ClangGithubCommit.objects.filter(sha__in=shas).only( + "sha", "github_committed_at" + ) + } + existing = set(existing_committed.keys()) now = timezone.now() objs = [ - ClangGithubCommit(sha=s, github_committed_at=dt, updated_at=now) + ClangGithubCommit( + sha=s, + github_committed_at=_max_dt(existing_committed.get(s), dt), + updated_at=now, + ) for s, dt in pairs ] ClangGithubCommit.objects.bulk_create( @@ -131,22 +171,31 @@ def _flush_issue_items_chunk( if not rows: return 0, 0 nums = [n for n, _, _, _ in rows] - existing = set( - ClangGithubIssueItem.objects.filter(number__in=nums).values_list( - "number", flat=True + existing_by_num = { + obj.number: obj + for obj in ClangGithubIssueItem.objects.filter(number__in=nums).only( + "number", + "is_pull_request", + "github_created_at", + "github_updated_at", ) - ) + } + existing = set(existing_by_num.keys()) now = timezone.now() - objs = [ - ClangGithubIssueItem( - number=n, - is_pull_request=is_pr, - github_created_at=gc, - github_updated_at=gu, - updated_at=now, + objs = [] + for n, is_pr, gc, gu in rows: + m_is_pr, m_gc, m_gu = _merge_issue_item_fields( + existing_by_num.get(n), is_pr, gc, gu + ) + objs.append( + ClangGithubIssueItem( + number=n, + is_pull_request=m_is_pr, + github_created_at=m_gc, + github_updated_at=m_gu, + updated_at=now, + ) ) - for n, is_pr, gc, gu in rows - ] ClangGithubIssueItem.objects.bulk_create( objs, batch_size=len(objs), @@ -172,8 +221,9 @@ def upsert_issue_items_batch( """Batch upsert issue/PR rows by ``number``. Duplicate ``number`` values merge: ``github_updated_at`` uses the latest - timestamp; ``github_created_at`` keeps the first non-None; ``is_pull_request`` - is True if any row marks the number as a PR. 
+ timestamp; ``github_created_at`` uses a later row’s value when non-None, + otherwise keeps the prior value; ``is_pull_request`` is True if any row + marks the number as a PR. Returns: (inserted, updated) counts across all batches. @@ -189,7 +239,7 @@ def upsert_issue_items_batch( prev_is_pr, prev_gc, prev_gu = prev merged[num] = ( prev_is_pr or is_pr, - prev_gc if prev_gc is not None else gc, + gc if gc is not None else prev_gc, _max_dt(prev_gu, gu), ) inserted = updated = 0 diff --git a/clang_github_tracker/tests/test_preprocessors.py b/clang_github_tracker/tests/test_preprocessors.py index eba6716..8009237 100644 --- a/clang_github_tracker/tests/test_preprocessors.py +++ b/clang_github_tracker/tests/test_preprocessors.py @@ -22,6 +22,8 @@ def test_issue_preprocessor_db_and_failed_ids(mock_build, tmp_path, settings): p10 = tmp_path / "10.json" p10.write_text("{}", encoding="utf-8") + p99 = tmp_path / "99.json" + p99.write_text("{}", encoding="utf-8") ClangGithubIssueItem.objects.create( number=10, @@ -31,7 +33,7 @@ def test_issue_preprocessor_db_and_failed_ids(mock_build, tmp_path, settings): final = timezone.now() - timedelta(hours=1) def _issue_path(_owner, _repo, n): - return p10 if n == 10 else tmp_path / f"missing_{n}.json" + return {10: p10, 99: p99}.get(n, tmp_path / f"missing_{n}.json") with patch( "clang_github_tracker.preprocessors.issue_preprocessor.get_raw_source_issue_path", @@ -41,8 +43,9 @@ def _issue_path(_owner, _repo, n): ["llvm-project:issue:99"], final ) assert chunked is False - assert mock_build.call_count == 1 - assert len(docs) == 1 + # DB watermark picks #10; failed_ids must parse llvm-project:issue:99 and add #99. 
+ assert mock_build.call_count == 2 + assert len(docs) == 2 @pytest.mark.django_db diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py index d79ce97..8abb626 100644 --- a/clang_github_tracker/tests/test_services.py +++ b/clang_github_tracker/tests/test_services.py @@ -114,3 +114,73 @@ def test_upsert_issue_items_batch_duplicate_merges_is_pr_or(): t0 = timezone.now() - timedelta(days=1) clang_services.upsert_issue_items_batch([(8, False, t0, t0), (8, True, t0, t0)]) assert ClangGithubIssueItem.objects.get(number=8).is_pull_request is True + + +@pytest.mark.django_db +def test_upsert_issue_item_merge_keeps_pr_and_timestamps_when_incoming_partial(): + t_created = timezone.now() - timedelta(days=10) + t_updated = timezone.now() - timedelta(days=3) + clang_services.upsert_issue_item( + 99, + is_pull_request=True, + github_created_at=t_created, + github_updated_at=t_updated, + ) + clang_services.upsert_issue_item( + 99, + is_pull_request=False, + github_created_at=None, + github_updated_at=None, + ) + row = ClangGithubIssueItem.objects.get(number=99) + assert row.is_pull_request is True + assert row.github_created_at == t_created + assert row.github_updated_at == t_updated + + +@pytest.mark.django_db +def test_upsert_issue_item_merge_github_updated_at_max(): + t_old = timezone.now() - timedelta(days=5) + t_new = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_item( + 100, + is_pull_request=False, + github_created_at=t_old, + github_updated_at=t_new, + ) + clang_services.upsert_issue_item( + 100, + is_pull_request=False, + github_created_at=None, + github_updated_at=t_old, + ) + assert ClangGithubIssueItem.objects.get(number=100).github_updated_at == t_new + + +@pytest.mark.django_db +def test_upsert_commit_merge_preserves_committed_at_when_incoming_none(): + sha = "e" * 40 + t0 = timezone.now() - timedelta(hours=2) + clang_services.upsert_commit(sha, github_committed_at=t0) + 
clang_services.upsert_commit(sha, github_committed_at=None) + assert ClangGithubCommit.objects.get(sha=sha).github_committed_at == t0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_merge_with_db_preserves_updated_when_incoming_none(): + t0 = timezone.now() - timedelta(days=2) + t1 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(20, False, t0, t1)]) + clang_services.upsert_issue_items_batch([(20, False, None, None)]) + row = ClangGithubIssueItem.objects.get(number=20) + assert row.github_created_at == t0 + assert row.github_updated_at == t1 + assert row.is_pull_request is False + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_merge_with_db_keeps_pr_once_true(): + t0 = timezone.now() - timedelta(days=1) + clang_services.upsert_issue_items_batch([(21, True, t0, t0)]) + clang_services.upsert_issue_items_batch([(21, False, t0, t0)]) + assert ClangGithubIssueItem.objects.get(number=21).is_pull_request is True diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 7b47ee6..00bf524 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -92,7 +92,7 @@ def test_resolve_invalid_range_clears_bounds(caplog): @pytest.mark.django_db def test_resolve_since_floor_without_until(): - """Only since: starts are max(DB+1s, since).""" + """Only since: both starts equal the explicit since; DB watermarks are ignored.""" base = timezone.now() - timedelta(days=30) ClangGithubIssueItem.objects.create( number=2, @@ -100,6 +100,7 @@ def test_resolve_since_floor_without_until(): github_updated_at=base, ) since = timezone.now() - timedelta(days=1) - _sc, si, _end = clang_state.resolve_start_end_dates(since, None) - assert si is not None - assert si >= since + sc, si, end = clang_state.resolve_start_end_dates(since, None) + assert sc == since + assert si == since + assert end is None diff --git 
a/github_ops/git_ops.py b/github_ops/git_ops.py index 911d7ca..93e828e 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -234,12 +234,14 @@ def clone_repo( e.returncode, safe_err_tail, ) - # Never re-raise with the real cmd: it embeds the token in the clone URL. + # Never re-raise with the real cmd or raw output: they may embed the token. safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] if depth is not None: safe_cmd.extend(["--depth", str(depth)]) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") raise subprocess.CalledProcessError( - e.returncode, safe_cmd, e.stdout, e.stderr + e.returncode, safe_cmd, safe_stdout, safe_stderr ) from None From 6922fd5b89a93236b826409d9b106ce4f95f2793 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 14:39:24 -0400 Subject: [PATCH 69/76] fix: sanitize git_ops remote errors; clang publisher stale-title cleanup uses export tree - #136 --- Dockerfile | 2 +- clang_github_tracker/publisher.py | 40 ++++-- clang_github_tracker/state_manager.py | 2 +- clang_github_tracker/tests/test_publisher.py | 41 +++++- github_ops/git_ops.py | 125 +++++++++++++++---- github_ops/tests/test_git_ops.py | 88 +++++++++++++ operations/md_ops/github_export.py | 7 +- operations/tests/test_github_export.py | 4 +- 8 files changed, 275 insertions(+), 34 deletions(-) diff --git a/Dockerfile b/Dockerfile index a2c10b4..8a26be9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,7 +36,7 @@ RUN useradd --create-home appuser && chown -R appuser /app # Git 2.35+ blocks repos when directory owner != current user; bind mounts often # disagree (e.g. Docker Desktop on Windows). System config applies to root and appuser # (e.g. docker exec as root vs gosu appuser in entrypoint). 
-RUN git config --system --add safe.directory '*' +RUN git config --system --add safe.directory '/app/workspace/*' ENTRYPOINT ["/app/docker-entrypoint.sh"] # Container starts as root so entrypoint can chown; CMD runs as appuser via gosu diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index 965c92a..82aabbb 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -58,6 +58,22 @@ def _reset_hard_to_upstream(clone_dir: Path, remote: str, branch: str) -> None: raise CommandError(f"Could not reset clone to {ref}: {err}") from e +def _md_repo_rel_map(md_output_dir: Path) -> dict[str, str]: + """Map repo-relative posix path → absolute path for each .md under md_output_dir.""" + md_output_dir = md_output_dir.resolve() + out: dict[str, str] = {} + for path in md_output_dir.rglob("*"): + if not path.is_file(): + continue + if ".git" in path.relative_to(md_output_dir).parts: + continue + if path.suffix.lower() != ".md": + continue + rel = path.relative_to(md_output_dir).as_posix() + out[rel] = str(path.resolve()) + return out + + def _copy_md_tree(md_output_dir: Path, clone_dir: Path) -> None: """Copy all files under md_output_dir into clone_dir (preserve relative paths).""" md_output_dir = md_output_dir.resolve() @@ -85,6 +101,10 @@ def publish_clang_markdown( align to origin/<branch>, remove stale titled .md in md_export and clone, overlay md_export into the clone, commit and push. + Stale paths under ``md_output_dir`` use ``new_files`` (this run's writes). Stale + paths in the clone are detected using all ``.md`` files currently on disk under + ``md_output_dir`` so the clone matches the export tree. + Uses get_github_token(use=\"write\") and settings GIT_AUTHOR_* for the commit. 
""" owner = _validate_github_slug("owner", owner) @@ -186,15 +206,21 @@ def publish_clang_markdown( _reset_hard_to_upstream(clone_dir, "origin", branch) stale_md = detect_stale_titled_paths(md_output_dir, new_files) - stale_clone = detect_stale_titled_paths(clone_dir, new_files) - all_stale = sorted(set(stale_md) | set(stale_clone)) - for rel in all_stale: - for base in (md_output_dir, clone_dir): - p = base / rel - if p.is_file(): - p.unlink() + for rel in stale_md: + p = md_output_dir / rel + if p.is_file(): + p.unlink() + md_repo_rel_map = _md_repo_rel_map(md_output_dir) + stale_clone = detect_stale_titled_paths(clone_dir, md_repo_rel_map) + + for rel in stale_clone: + p = clone_dir / rel + if p.is_file(): + p.unlink() + + all_stale = sorted(set(stale_md) | set(stale_clone)) if all_stale: logger.info( "clang_github_tracker publish: removed %s stale titled file(s).", diff --git a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index a12e352..318a2d5 100644 --- a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -65,7 +65,7 @@ def resolve_start_end_dates( - **Starts:** If ``since`` is set (without a valid closed window): ``start_commit`` and ``start_item`` are both ``since``. If ``since`` is not set: both are - ``Max(github_* timestamp) + 1 second`` from the DB when a watermark exists, else + ``Max(github_* timestamp) + 1 millisecond`` from the DB when a watermark exists, else ``None`` (full history). Watermarks use ``Max(github_committed_at)`` and ``Max(github_updated_at)`` on ``ClangGithubCommit`` / ``ClangGithubIssueItem``. 
""" diff --git a/clang_github_tracker/tests/test_publisher.py b/clang_github_tracker/tests/test_publisher.py index 946d232..a4856a9 100644 --- a/clang_github_tracker/tests/test_publisher.py +++ b/clang_github_tracker/tests/test_publisher.py @@ -52,7 +52,6 @@ def test_publish_clang_markdown_success_copies_and_pushes( f = sub / "#1 - Title.md" f.write_text("body", encoding="utf-8") new_files = {"issues/2024/2024-01/#1 - Title.md": str(f)} - with _author_settings(raw): publish_clang_markdown(md, "acme", "priv", "main", new_files) @@ -65,6 +64,46 @@ def test_publish_clang_markdown_success_copies_and_pushes( assert kwargs["commit_message"] == "chore: update Clang issues/PRs markdown" +@pytest.mark.django_db +@patch("clang_github_tracker.publisher.git_push") +@patch("clang_github_tracker.publisher._reset_hard_to_upstream") +@patch("clang_github_tracker.publisher.pull") +@patch("clang_github_tracker.publisher.prepare_repo_for_pull") +@patch("clang_github_tracker.publisher.get_github_token", return_value="tok") +def test_publish_clang_markdown_stale_title_cleanup_md_then_clone( + _token, + _prepare, + _pull, + _reset, + mock_push, + raw_and_md, +): + """Stale titled .md is removed from md_export (via new_files), then clone uses post-cleanup disk map.""" + raw, md, clone_root = raw_and_md + sub = md / "issues" / "2024" / "2024-01" + sub.mkdir(parents=True) + new_path = sub / "#1 - New title.md" + old_path = sub / "#1 - Old title.md" + new_path.write_text("new", encoding="utf-8") + old_path.write_text("old", encoding="utf-8") + + clone_sub = clone_root / "issues" / "2024" / "2024-01" + clone_sub.mkdir(parents=True) + (clone_sub / "#1 - Old title.md").write_text("stale on clone", encoding="utf-8") + + new_files = {"issues/2024/2024-01/#1 - New title.md": str(new_path)} + with _author_settings(raw): + publish_clang_markdown(md, "acme", "priv", "main", new_files) + + assert not old_path.is_file() + assert new_path.is_file() + copied = clone_sub / "#1 - New title.md" + assert 
copied.is_file() + assert copied.read_text(encoding="utf-8") == "new" + assert not (clone_sub / "#1 - Old title.md").is_file() + mock_push.assert_called_once() + + @pytest.mark.django_db @patch("clang_github_tracker.publisher.git_push") @patch("clang_github_tracker.publisher._reset_hard_to_upstream") diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 93e828e..5ec388f 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -329,6 +329,9 @@ def push( if branch: cmd.append(branch) logger.info("Pushing %s %s", repo_dir, branch or "(current)") + safe_push_cmd = ["git", "-C", str(repo_dir), "push", remote_url] + if branch: + safe_push_cmd.append(branch) try: subprocess.run( cmd, @@ -339,16 +342,32 @@ def push( errors="replace", timeout=GIT_CMD_TIMEOUT_SECONDS, ) - except subprocess.TimeoutExpired: + except subprocess.TimeoutExpired as e: logger.warning( "git push timed out after %ss (%s)", GIT_CMD_TIMEOUT_SECONDS, repo_dir, ) - raise + raise subprocess.TimeoutExpired( + safe_push_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None except subprocess.CalledProcessError as e: - logger.warning("git push failed (%s), returncode=%s", repo_dir, e.returncode) - raise + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git push failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_push_cmd, safe_stdout, safe_stderr + ) from None def pull( @@ -384,7 +403,32 @@ def pull( if branch: cmd.append(branch) logger.info("Pulling %s %s", repo_dir, branch or "(current)") - subprocess.run(cmd, check=True, capture_output=True, text=True) + safe_pull_cmd = ["git", "-C", 
str(repo_dir), "pull", remote_url] + if branch: + safe_pull_cmd.append(branch) + try: + subprocess.run( + cmd, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + except subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git pull failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_pull_cmd, safe_stdout, safe_stderr + ) from None def prepare_repo_for_pull( @@ -412,23 +456,60 @@ def prepare_repo_for_pull( auth_url = _url_with_token(remote_url, token or "") logger.info("Fetching %s refs (prune) in %s", remote, repo_dir) - subprocess.run( - [ - "git", - "-C", - str(repo_dir), - "fetch", - auth_url, - f"+refs/heads/*:refs/remotes/{remote}/*", - "--prune", - ], - check=True, - capture_output=True, - text=True, - encoding="utf-8", - errors="replace", - timeout=GIT_CMD_TIMEOUT_SECONDS, - ) + fetch_cmd = [ + "git", + "-C", + str(repo_dir), + "fetch", + auth_url, + f"+refs/heads/*:refs/remotes/{remote}/*", + "--prune", + ] + safe_fetch_cmd = [ + "git", + "-C", + str(repo_dir), + "fetch", + remote_url, + f"+refs/heads/*:refs/remotes/{remote}/*", + "--prune", + ] + try: + subprocess.run( + fetch_cmd, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=GIT_CMD_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + "git fetch timed out after %ss (%s)", + GIT_CMD_TIMEOUT_SECONDS, + repo_dir, + ) + raise subprocess.TimeoutExpired( + safe_fetch_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None + except 
subprocess.CalledProcessError as e: + err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] + safe_err_tail = sanitize_git_output(err_tail) + logger.warning( + "git fetch failed (%s), returncode=%s, stderr/stdout_tail=%r", + repo_dir, + e.returncode, + safe_err_tail, + ) + safe_stdout = sanitize_git_output(e.stdout or "") + safe_stderr = sanitize_git_output(e.stderr or "") + raise subprocess.CalledProcessError( + e.returncode, safe_fetch_cmd, safe_stdout, safe_stderr + ) from None logger.info("Running git clean -fd in %s", repo_dir) subprocess.run( ["git", "-C", str(repo_dir), "clean", "-fd"], diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index eded244..62d636c 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -1,7 +1,9 @@ """Tests for github_ops git_ops (clone, push, pull, fetch_file_content, upload_folder_to_github).""" +import subprocess from unittest.mock import MagicMock, patch +import pytest import requests from github_ops.git_ops import ( @@ -11,6 +13,7 @@ fetch_file_content, pull, get_commit_file_changes, + prepare_repo_for_pull, push, sanitize_git_output, upload_folder_to_github, @@ -277,6 +280,91 @@ def test_push_commit_failure_without_nothing_to_commit_raises(tmp_path): assert False, "push should have raised on commit failure" +def test_push_failure_redacts_token_from_reraised_exception_cmd(tmp_path): + """git push failure re-raises CalledProcessError whose cmd uses the token-free remote URL.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(returncode=0, stdout="", stderr=""), + MagicMock(returncode=0, stdout="", stderr=""), + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "push", + "https://x-access-token:SECRET@github.com/o/r.git", + "main", + ], + "", + "rejected", + ), + ] + with 
pytest.raises(subprocess.CalledProcessError) as excinfo: + push(tmp_path, "origin", branch="main", token="SECRET") + err = excinfo.value + cmd_str = " ".join(err.cmd) + assert "SECRET" not in cmd_str + assert remote in cmd_str + assert err.stderr == "rejected" + + +def test_pull_failure_redacts_token_from_reraised_exception_cmd(tmp_path): + """git pull failure re-raises CalledProcessError whose cmd uses the token-free remote URL.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "pull", + "https://x-access-token:XY@github.com/o/r.git", + ], + "", + "error", + ), + ] + with pytest.raises(subprocess.CalledProcessError) as excinfo: + pull(tmp_path, token="XY") + assert "XY" not in " ".join(excinfo.value.cmd) + assert remote in " ".join(excinfo.value.cmd) + + +def test_prepare_repo_fetch_failure_redacts_token_from_reraised_exception_cmd( + tmp_path, +): + """prepare_repo_for_pull fetch failure re-raises with cmd without embedded PAT.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.CalledProcessError( + 1, + [ + "git", + "-C", + str(tmp_path), + "fetch", + "https://x-access-token:PAT@github.com/o/r.git", + "+refs/heads/*:refs/remotes/origin/*", + "--prune", + ], + "", + "fetch failed", + ), + ] + with pytest.raises(subprocess.CalledProcessError) as excinfo: + prepare_repo_for_pull(tmp_path, remote="origin", token="PAT") + assert "PAT" not in " ".join(excinfo.value.cmd) + assert remote in excinfo.value.cmd[4] + + # --- pull --- diff --git a/operations/md_ops/github_export.py b/operations/md_ops/github_export.py index e4ee905..3abf105 100644 --- a/operations/md_ops/github_export.py +++ b/operations/md_ops/github_export.py @@ 
-157,7 +157,12 @@ def write_md_files( created_at = _parse_dt(created_at_raw) out_path = _md_path( - output_dir, folder_prefix, "pull_requests", created_at, number, title + output_dir, + folder_prefix, + "pull_requests", + created_at, + number, + title, ) try: md_content = pr_json_to_md(pr_data) diff --git a/operations/tests/test_github_export.py b/operations/tests/test_github_export.py index c483dc1..4679d1a 100644 --- a/operations/tests/test_github_export.py +++ b/operations/tests/test_github_export.py @@ -40,7 +40,9 @@ def test_detect_renames_from_dirs_no_conflict(mock_list_remote: MagicMock): @patch("operations.md_ops.github_export.list_remote_directory") -def test_detect_renames_from_dirs_non_numbered_md_ignored(mock_list_remote: MagicMock): +def test_detect_renames_from_dirs_non_numbered_md_ignored( + mock_list_remote: MagicMock, +): """Files not matching #n - prefix are ignored.""" mock_list_remote.return_value = ["issues/2024/2024-03/README.md"] new_files = {"issues/2024/2024-03/#5 - T.md": "/tmp/x"} From f8c1880764b7cb862d4eeaceafbec163cdde0755 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 15:27:29 -0400 Subject: [PATCH 70/76] docs: fix ClangGithubIssueItem watermark (+1ms); format services.py - #136 --- clang_github_tracker/services.py | 2 +- docs/Schema.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index fda9f85..712437b 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -221,7 +221,7 @@ def upsert_issue_items_batch( """Batch upsert issue/PR rows by ``number``. Duplicate ``number`` values merge: ``github_updated_at`` uses the latest - timestamp; ``github_created_at`` uses a later row’s value when non-None, + timestamp; ``github_created_at`` uses a later row's value when non-None, otherwise keeps the prior value; ``is_pull_request`` is True if any row marks the number as a PR. 
diff --git a/docs/Schema.md b/docs/Schema.md index 77c91bd..ec3ac25 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -361,7 +361,7 @@ Standalone tables for the **llvm/llvm-project** (or `CLANG_GITHUB_OWNER` / `CLAN | Model | Purpose | | ----- | ------- | -| **ClangGithubIssueItem** | One row per issue or PR **number** (`unique`). `is_pull_request` distinguishes types. `github_created_at` / `github_updated_at` mirror GitHub API times; **`github_updated_at`** (with `Max` + 1s) drives **API fetch** resume. Django **`updated_at`** (`auto_now`) bumps on every upsert and drives **Pinecone** incrementality vs `PineconeSyncStatus.final_sync_at`. | +| **ClangGithubIssueItem** | One row per issue or PR **number** (`unique`). `is_pull_request` distinguishes types. `github_created_at` / `github_updated_at` mirror GitHub API times; **`github_updated_at`** (with `Max` + 1ms) drives **API fetch** resume. Django **`updated_at`** (`auto_now`) bumps on every upsert and drives **Pinecone** incrementality vs `PineconeSyncStatus.final_sync_at`. | | **ClangGithubCommit** | One row per **sha** (`unique`, 40-char hex). `github_committed_at` is the author/committer date used for commit fetch watermarks. | Raw JSON remains under `workspace/raw/github_activity_tracker/<owner>/<repo>/` (same layout as other raw GitHub activity). 
From 10daaa3737b46fc61f1da74e24e9244f4ab27e10 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 16:41:59 -0400 Subject: [PATCH 71/76] fix(clang): issue number validation + lowercase SHAs; git_ops timeout redaction for clone/pull - #136 --- clang_github_tracker/services.py | 13 +++-- clang_github_tracker/tests/test_services.py | 59 +++++++++++++++++++-- github_ops/git_ops.py | 28 ++++++++-- github_ops/tests/test_git_ops.py | 50 +++++++++++++++++ 4 files changed, 141 insertions(+), 9 deletions(-) diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index 712437b..42411fb 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -17,6 +17,11 @@ DEFAULT_UPSERT_BATCH_SIZE = 500 +def _invalid_issue_number(n: object) -> bool: + """True if ``n`` is not a positive ``int`` (rejects ``bool`` — it subclasses ``int``).""" + return isinstance(n, bool) or not isinstance(n, int) or n <= 0 + + def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | None: """Return the later of two datetimes; ``None`` is treated as missing (never wins over a value).""" if current is None: @@ -54,6 +59,8 @@ def upsert_issue_item( github_updated_at: datetime | None, ) -> tuple[ClangGithubIssueItem, bool]: """Create or update a ClangGithubIssueItem by ``number``. Returns (instance, created).""" + if _invalid_issue_number(number): + raise ValueError(f"issue number must be a positive integer, got {number!r}") existing = ClangGithubIssueItem.objects.filter(number=number).first() is_pr, gc, gu = _merge_issue_item_fields( existing, @@ -84,7 +91,7 @@ def upsert_commit( github_committed_at: datetime | None, ) -> tuple[ClangGithubCommit, bool]: """Create or update a ClangGithubCommit by ``sha``. 
Returns (instance, created).""" - sha_clean = (sha or "").strip() + sha_clean = (sha or "").strip().lower() if len(sha_clean) != 40: raise ValueError(f"commit sha must be 40 hex chars, got {sha_clean!r}") existing = ClangGithubCommit.objects.filter(sha=sha_clean).first() @@ -151,7 +158,7 @@ def upsert_commits_batch( """ merged: dict[str, datetime | None] = {} for sha, dt in rows: - s = (sha or "").strip() + s = (sha or "").strip().lower() if len(s) != 40: continue merged[s] = _max_dt(merged.get(s), dt) @@ -230,7 +237,7 @@ def upsert_issue_items_batch( """ merged: dict[int, tuple[bool, datetime | None, datetime | None]] = {} for num, is_pr, gc, gu in rows: - if not isinstance(num, int) or num <= 0: + if _invalid_issue_number(num): continue prev = merged.get(num) if prev is None: diff --git a/clang_github_tracker/tests/test_services.py b/clang_github_tracker/tests/test_services.py index 8abb626..2e4120e 100644 --- a/clang_github_tracker/tests/test_services.py +++ b/clang_github_tracker/tests/test_services.py @@ -9,6 +9,34 @@ from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem +@pytest.mark.django_db +def test_upsert_issue_item_rejects_bool_and_non_positive(): + t0 = timezone.now() + with pytest.raises(ValueError, match="positive integer"): + clang_services.upsert_issue_item( + True, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + with pytest.raises(ValueError, match="positive integer"): + clang_services.upsert_issue_item( + 0, + is_pull_request=False, + github_created_at=t0, + github_updated_at=t0, + ) + assert ClangGithubIssueItem.objects.count() == 0 + + +@pytest.mark.django_db +def test_upsert_issue_items_batch_skips_bool_does_not_upsert_as_issue_one(): + t0 = timezone.now() + ins, _ = clang_services.upsert_issue_items_batch([(True, False, t0, t0)]) + assert ins == 0 + assert not ClangGithubIssueItem.objects.filter(number=1).exists() + + @pytest.mark.django_db def 
test_upsert_issue_item_create_and_update_bumps_updated_at(): t0 = timezone.now() - timedelta(days=2) @@ -31,8 +59,8 @@ def test_upsert_issue_item_create_and_update_bumps_updated_at(): ) assert created2 is False row.refresh_from_db() - assert row.updated_at > first_updated assert row.github_updated_at == t1 + assert row.updated_at >= first_updated @pytest.mark.django_db @@ -56,7 +84,7 @@ def test_upsert_commits_batch_create_and_update(): assert ins2 == 0 and upd2 == 1 row.refresh_from_db() assert row.github_committed_at == t1 - assert row.updated_at > first_updated + assert row.updated_at >= first_updated @pytest.mark.django_db @@ -73,7 +101,32 @@ def test_upsert_issue_items_batch_create_and_update(): assert ins2 == 0 and upd2 == 1 row.refresh_from_db() assert row.github_updated_at == t1 - assert row.updated_at > first_updated + assert row.updated_at >= first_updated + + +@pytest.mark.django_db +def test_upsert_commits_batch_dedupes_sha_by_case(): + """Uppercase and lowercase hex refer to the same commit; merge timestamps in one row.""" + sha_lower = "abcdef" + "0" * 34 + sha_upper = "ABCDEF" + "0" * 34 + t_new = timezone.now() + t_old = t_new - timedelta(days=7) + ins, _ = clang_services.upsert_commits_batch( + [(sha_upper, t_old), (sha_lower, t_new)] + ) + assert ins == 1 + assert ClangGithubCommit.objects.count() == 1 + row = ClangGithubCommit.objects.get(sha=sha_lower) + assert row.github_committed_at == t_new + + +@pytest.mark.django_db +def test_upsert_commit_canonicalizes_sha_to_lowercase(): + sha_mixed = "AbCdEf" + "0" * 34 + t0 = timezone.now() + clang_services.upsert_commit(sha_mixed, github_committed_at=t0) + row = ClangGithubCommit.objects.get(sha=sha_mixed.lower()) + assert row.github_committed_at == t0 @pytest.mark.django_db diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 5ec388f..4d9ff0b 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -216,14 +216,23 @@ def clone_repo( errors="replace", 
timeout=GIT_CMD_TIMEOUT_SECONDS, ) - except subprocess.TimeoutExpired: + except subprocess.TimeoutExpired as e: + safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] + if depth is not None: + safe_cmd.extend(["--depth", str(depth)]) + safe_url_or_slug = sanitize_git_output(url_or_slug) logger.warning( "git clone timed out after %ss (%s -> %s)", GIT_CMD_TIMEOUT_SECONDS, - url_or_slug, + safe_url_or_slug, dest_dir, ) - raise + raise subprocess.TimeoutExpired( + safe_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None except subprocess.CalledProcessError as e: err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] safe_err_tail = sanitize_git_output(err_tail) @@ -414,7 +423,20 @@ def pull( text=True, encoding="utf-8", errors="replace", + timeout=GIT_CMD_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + "git pull timed out after %ss (%s)", + GIT_CMD_TIMEOUT_SECONDS, + repo_dir, ) + raise subprocess.TimeoutExpired( + safe_pull_cmd, + e.timeout, + output=None if e.output is None else sanitize_git_output(e.output), + stderr=None if e.stderr is None else sanitize_git_output(e.stderr), + ) from None except subprocess.CalledProcessError as e: err_tail = ((e.stderr or "") + (e.stdout or ""))[-500:] safe_err_tail = sanitize_git_output(err_tail) diff --git a/github_ops/tests/test_git_ops.py b/github_ops/tests/test_git_ops.py index 62d636c..83da5b8 100644 --- a/github_ops/tests/test_git_ops.py +++ b/github_ops/tests/test_git_ops.py @@ -7,6 +7,7 @@ import requests from github_ops.git_ops import ( + GIT_CMD_TIMEOUT_SECONDS, _create_blob_with_retry, _url_with_token, clone_repo, @@ -120,6 +121,27 @@ def test_clone_repo_uses_get_github_token_when_token_not_provided(tmp_path): get_token.assert_called_once_with(use="scraping") +def test_clone_repo_timeout_redacts_token_from_reraised_exception_cmd(tmp_path): + """clone 
timeout re-raises TimeoutExpired whose cmd omits the PAT (matches real clone cmd).""" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = subprocess.TimeoutExpired( + [ + "git", + "clone", + "https://x-access-token:LEAK@github.com/o/r.git", + str(tmp_path), + ], + 300, + output="", + stderr="", + ) + with pytest.raises(subprocess.TimeoutExpired) as excinfo: + clone_repo("https://github.com/o/r.git", tmp_path, token="LEAK") + assert "LEAK" not in " ".join(excinfo.value.cmd) + assert "https://github.com/o/r.git" in excinfo.value.cmd[2] + assert excinfo.value.timeout == 300 + + # --- push --- @@ -336,6 +358,32 @@ def test_pull_failure_redacts_token_from_reraised_exception_cmd(tmp_path): assert remote in " ".join(excinfo.value.cmd) +def test_pull_timeout_redacts_token_from_reraised_exception_cmd(tmp_path): + """git pull timeout re-raises TimeoutExpired whose cmd omits the PAT.""" + remote = "https://github.com/o/r.git" + with patch("github_ops.git_ops.subprocess.run") as run_mock: + run_mock.side_effect = [ + MagicMock(stdout=f"{remote}\n", stderr=""), + subprocess.TimeoutExpired( + [ + "git", + "-C", + str(tmp_path), + "pull", + "https://x-access-token:XY@github.com/o/r.git", + ], + GIT_CMD_TIMEOUT_SECONDS, + output="", + stderr="", + ), + ] + with pytest.raises(subprocess.TimeoutExpired) as excinfo: + pull(tmp_path, token="XY") + assert "XY" not in " ".join(excinfo.value.cmd) + assert remote in " ".join(excinfo.value.cmd) + assert excinfo.value.timeout == GIT_CMD_TIMEOUT_SECONDS + + def test_prepare_repo_fetch_failure_redacts_token_from_reraised_exception_cmd( tmp_path, ): @@ -383,6 +431,7 @@ def test_pull_with_branch_runs_checkout_then_pull(tmp_path): assert calls[0][-1] == "main" assert "pull" in calls[2] assert "main" in calls[2] + assert run_mock.call_args_list[2][1].get("timeout") == GIT_CMD_TIMEOUT_SECONDS def test_pull_without_branch_does_not_run_checkout(tmp_path): @@ -396,6 +445,7 @@ def 
test_pull_without_branch_does_not_run_checkout(tmp_path): calls = [c[0][0] for c in run_mock.call_args_list] checkout_calls = [c for c in calls if "checkout" in c] assert len(checkout_calls) == 0 + assert run_mock.call_args_list[-1][1].get("timeout") == GIT_CMD_TIMEOUT_SECONDS def test_pull_uses_get_github_token_when_token_not_provided(tmp_path): From 9464188bd3a5492e3592d131f184b9957b78aff3 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Fri, 3 Apr 2026 17:31:15 -0400 Subject: [PATCH 72/76] fix: clang backfill raw-only; sanitize clone/publish git errors; sync_raw issue-number guards - #136 --- .../commands/backfill_clang_github_tracker.py | 117 ++---------------- clang_github_tracker/publisher.py | 29 +++-- clang_github_tracker/sync_raw.py | 13 +- clang_github_tracker/tests/test_backfill.py | 34 +---- .../tests/test_state_manager.py | 4 +- clang_github_tracker/workspace.py | 19 +-- docs/Workspace.md | 1 - github_ops/git_ops.py | 10 +- 8 files changed, 45 insertions(+), 182 deletions(-) diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py index bce2f32..2379dea 100644 --- a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -1,26 +1,19 @@ """ -Backfill ClangGithubIssueItem / ClangGithubCommit from CSV or raw JSON scan. +Backfill ClangGithubIssueItem / ClangGithubCommit from raw JSON scan. 
""" from __future__ import annotations -import csv import json import logging import re from datetime import datetime -from pathlib import Path from django.core.management.base import BaseCommand, CommandError from clang_github_tracker import services as clang_services from clang_github_tracker import state_manager as clang_state -from clang_github_tracker.workspace import ( - OWNER, - REPO, - default_backfill_csv_path, - get_raw_repo_dir, -) +from clang_github_tracker.workspace import OWNER, REPO, get_raw_repo_dir from github_activity_tracker.sync.utils import ( normalize_issue_json, normalize_pr_json, @@ -44,112 +37,16 @@ def _commit_date_from_json(data: dict): class Command(BaseCommand): - """Load ``ClangGithubIssueItem`` / ``ClangGithubCommit`` from CSV or raw JSON.""" + """Load ``ClangGithubIssueItem`` / ``ClangGithubCommit`` from raw JSON dirs.""" help = ( - "Backfill clang_github_tracker DB from CSV (--from-csv) or raw JSON dirs (--from-raw). " - "CSV columns: record_type (issue|pr|commit), number, github_created_at, github_updated_at, " - "sha, github_committed_at." + "Backfill clang_github_tracker DB by scanning " + "raw/github_activity_tracker/<owner>/<repo>/commits|issues|prs/*.json" ) - def add_arguments(self, parser): - """Add mutually exclusive ``--from-csv`` and ``--from-raw`` options.""" - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( - "--from-csv", - nargs="?", - const="", - default=None, - metavar="PATH", - help=( - "Import from CSV. 
If PATH is omitted, use workspace/clang_github_tracker/" - "clang_github_tracker_backfill.csv" - ), - ) - group.add_argument( - "--from-raw", - action="store_true", - help="Scan raw/github_activity_tracker/<owner>/<repo>/commits|issues|prs/*.json", - ) - def handle(self, *args, **options): - """Dispatch to CSV or raw-directory backfill.""" - if options.get("from_raw"): - self._backfill_from_raw() - return - csv_arg = options.get("from_csv") - path = Path(csv_arg) if csv_arg else default_backfill_csv_path() - self._backfill_from_csv(path) - - def _backfill_from_csv(self, path: Path) -> None: - """Parse CSV at ``path`` and batch-upsert issues, PRs, and commits.""" - if not path.is_file(): - raise CommandError(f"CSV not found: {path}") - commit_rows: list[tuple[str, datetime | None]] = [] - issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] - skipped = 0 - with path.open(encoding="utf-8", newline="") as f: - reader = csv.DictReader(f) - if not reader.fieldnames: - raise CommandError("CSV has no header row") - for row in reader: - rt = (row.get("record_type") or "").strip().lower() - try: - if rt == "issue": - num = int((row.get("number") or "").strip()) - if num <= 0: - logger.warning("skip issue row: invalid number %r", num) - skipped += 1 - continue - gc = parse_datetime( - (row.get("github_created_at") or "").strip() - ) - gu = parse_datetime( - (row.get("github_updated_at") or "").strip() - ) - issue_rows.append((num, False, gc, gu)) - elif rt == "pr": - num = int((row.get("number") or "").strip()) - if num <= 0: - logger.warning("skip pr row: invalid number %r", num) - skipped += 1 - continue - gc = parse_datetime( - (row.get("github_created_at") or "").strip() - ) - gu = parse_datetime( - (row.get("github_updated_at") or "").strip() - ) - issue_rows.append((num, True, gc, gu)) - elif rt == "commit": - sha = (row.get("sha") or "").strip() - if not _SHA40.match(sha): - logger.warning("skip commit row: invalid sha %r", sha) - skipped += 1 - 
continue - gcm = parse_datetime( - (row.get("github_committed_at") or "").strip() - ) - commit_rows.append((sha, gcm)) - else: - logger.warning("skip row: unknown record_type %r", rt) - skipped += 1 - except (TypeError, ValueError) as e: - logger.warning("skip row: %s (row=%r)", e, row) - skipped += 1 - - ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) - ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) - logger.info( - "CSV backfill done: issues_prs inserted=%s updated=%s commits inserted=%s " - "updated=%s skipped=%s path=%s", - ins_i, - upd_i, - ins_c, - upd_c, - skipped, - path, - ) + """Scan raw JSON under the configured repo and upsert DB rows.""" + self._backfill_from_raw() def _backfill_from_raw(self) -> None: """Scan ``commits`` / ``issues`` / ``prs`` JSON under the raw repo dir and upsert.""" diff --git a/clang_github_tracker/publisher.py b/clang_github_tracker/publisher.py index 82aabbb..36b35c5 100644 --- a/clang_github_tracker/publisher.py +++ b/clang_github_tracker/publisher.py @@ -16,6 +16,7 @@ prepare_repo_for_pull, pull, push as git_push, + sanitize_git_output, ) from github_ops.tokens import get_github_token from operations.md_ops.github_export import detect_stale_titled_paths @@ -25,6 +26,13 @@ _GITHUB_OWNER_REPO_SLUG = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?$") +def _redacted_git_subprocess_error(e: subprocess.CalledProcessError) -> str: + """Stderr/stdout or fallback ``str(e)``, redacted for logs and ``CommandError`` text.""" + tail = ((e.stderr or "") + (e.stdout or "")).strip() + text = tail if tail else str(e) + return sanitize_git_output(text) + + def _validate_github_slug(label: str, value: str) -> str: """Return stripped owner or repo name, or raise CommandError if unsafe or invalid.""" v = (value or "").strip() @@ -54,7 +62,7 @@ def _reset_hard_to_upstream(clone_dir: Path, remote: str, branch: str) -> None: errors="replace", ) except subprocess.CalledProcessError as e: - err = ((e.stderr 
or "") + (e.stdout or "")).strip() or str(e) + err = _redacted_git_subprocess_error(e) raise CommandError(f"Could not reset clone to {ref}: {err}") from e @@ -132,7 +140,10 @@ def publish_clang_markdown( # Private CLANG_GITHUB_CONTEXT_* repos need a PAT that can read them (clone/pull) # and push; get_github_token("write") uses GITHUB_TOKEN_WRITE or GITHUB_TOKEN. - token = get_github_token(use="write") + try: + token = get_github_token(use="write") + except ValueError as e: + raise CommandError(str(e)) from e git_user_name = ( getattr(settings, "GIT_AUTHOR_NAME", None) or "" ).strip() or "unknown" @@ -157,7 +168,7 @@ def publish_clang_markdown( try: clone_repo(repo_slug, clone_dir, token=token) except subprocess.CalledProcessError as e: - tail = ((e.stderr or "") + (e.stdout or "")).strip() + msg = _redacted_git_subprocess_error(e) hint = ( "Clone already uses get_github_token(use='write') (GITHUB_TOKEN_WRITE " "or GITHUB_TOKEN). Verify CLANG_GITHUB_CONTEXT_REPO_OWNER / _NAME, " @@ -166,18 +177,16 @@ def publish_clang_markdown( "that repository. GitHub often returns 'not found' when the token " "lacks access." ) - logger.error( - "clang_github_tracker publish: git clone failed: %s", tail or e - ) + logger.error("clang_github_tracker publish: git clone failed: %s", msg) raise CommandError( - f"Git clone failed for {repo_slug}: {tail or e.returncode}. {hint}" + f"Git clone failed for {repo_slug}: {msg}. 
{hint}" ) from e logger.info("Bootstrapping clone before pull: fetch, clean, reset (%s)", clone_dir) try: prepare_repo_for_pull(clone_dir, remote="origin", token=token) except subprocess.CalledProcessError as e: - err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + err = _redacted_git_subprocess_error(e) logger.error( "clang_github_tracker publish: prepare clone for pull failed " "(clone_dir=%s, branch=%s): %s", @@ -192,7 +201,7 @@ def publish_clang_markdown( try: pull(clone_dir, branch=branch, token=token) except subprocess.CalledProcessError as e: - err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + err = _redacted_git_subprocess_error(e) logger.error( "clang_github_tracker publish: git pull failed (clone_dir=%s, branch=%s): %s", clone_dir, @@ -240,7 +249,7 @@ def publish_clang_markdown( git_user_email=git_user_email, ) except subprocess.CalledProcessError as e: - err = ((e.stderr or "") + (e.stdout or "")).strip() or str(e) + err = _redacted_git_subprocess_error(e) logger.error("clang_github_tracker publish: git push failed: %s", err) raise CommandError(f"Git push failed: {err}") from e diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index 5f2dee0..d5a456a 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -40,6 +40,11 @@ def _ensure_utc(dt: datetime | None) -> datetime | None: return dt.astimezone(timezone.utc) +def _valid_positive_issue_number(n: object) -> bool: + """True for a positive issue/PR number; rejects ``bool`` (``type(n) is int``).""" + return type(n) is int and n > 0 + + def _commit_date(commit_data: dict) -> datetime | None: """Extract author/committer date from GitHub commit payload.""" commit = commit_data.get("commit") or {} @@ -104,32 +109,32 @@ def sync_clang_github_activity( pr_number = (item["pr_info"] or {}).get("number") if pr_number is not None: save_pr_raw_source(owner, repo, item) - pr_numbers.append(pr_number) flat = normalize_pr_json(item) 
num = flat.get("number") - if isinstance(num, int) and num > 0: + if _valid_positive_issue_number(num): clang_services.upsert_issue_item( num, is_pull_request=True, github_created_at=parse_datetime(flat.get("created_at")), github_updated_at=parse_datetime(flat.get("updated_at")), ) + pr_numbers.append(num) else: issue_number = (item.get("issue_info") or {}).get("number") or item.get( "number" ) if issue_number is not None: save_issue_raw_source(owner, repo, item) - issue_numbers.append(issue_number) flat = normalize_issue_json(item) num = flat.get("number") - if isinstance(num, int) and num > 0: + if _valid_positive_issue_number(num): clang_services.upsert_issue_item( num, is_pull_request=False, github_created_at=parse_datetime(flat.get("created_at")), github_updated_at=parse_datetime(flat.get("updated_at")), ) + issue_numbers.append(num) except (ConnectionException, RateLimitException) as e: logger.exception("clang_github_tracker sync failed: %s", e) diff --git a/clang_github_tracker/tests/test_backfill.py b/clang_github_tracker/tests/test_backfill.py index 7bfe9c3..d6167d2 100644 --- a/clang_github_tracker/tests/test_backfill.py +++ b/clang_github_tracker/tests/test_backfill.py @@ -9,38 +9,6 @@ from clang_github_tracker.workspace import OWNER, REPO -@pytest.mark.django_db -def test_backfill_csv(tmp_path): - csv_path = tmp_path / "b.csv" - csv_path.write_text( - "record_type,number,github_created_at,github_updated_at,sha,github_committed_at\n" - "issue,1,2024-01-01T00:00:00Z,2024-01-02T00:00:00Z,,\n" - "pr,2,,,,\n" - f"commit,,,,{'a' * 40},2024-03-01T00:00:00Z\n", - encoding="utf-8", - ) - call_command("backfill_clang_github_tracker", f"--from-csv={csv_path}") - assert ClangGithubIssueItem.objects.filter(number=1, is_pull_request=False).exists() - assert ClangGithubIssueItem.objects.filter(number=2, is_pull_request=True).exists() - assert ClangGithubCommit.objects.filter(sha="a" * 40).exists() - - -@pytest.mark.django_db -def 
test_backfill_csv_skips_non_positive_issue_pr_numbers(tmp_path): - csv_path = tmp_path / "bad_nums.csv" - csv_path.write_text( - "record_type,number,github_created_at,github_updated_at,sha,github_committed_at\n" - "issue,0,,,,\n" - "issue,-3,,,,\n" - "pr,-1,,,,\n" - "issue,5,2024-01-01T00:00:00Z,2024-01-02T00:00:00Z,,\n", - encoding="utf-8", - ) - call_command("backfill_clang_github_tracker", f"--from-csv={csv_path}") - assert ClangGithubIssueItem.objects.filter(number=5, is_pull_request=False).exists() - assert ClangGithubIssueItem.objects.count() == 1 - - @pytest.mark.django_db def test_backfill_from_raw(tmp_path, monkeypatch): root = tmp_path / "raw" / OWNER / REPO @@ -88,7 +56,7 @@ def test_backfill_from_raw(tmp_path, monkeypatch): "clang_github_tracker.management.commands.backfill_clang_github_tracker.get_raw_repo_dir", lambda *a, **k: root, ) - call_command("backfill_clang_github_tracker", "--from-raw") + call_command("backfill_clang_github_tracker") assert ClangGithubIssueItem.objects.filter(number=3, is_pull_request=False).exists() assert ClangGithubIssueItem.objects.filter(number=4, is_pull_request=True).exists() assert ClangGithubCommit.objects.filter(sha=sha).exists() diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 00bf524..20b8e51 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -87,7 +87,9 @@ def test_resolve_invalid_range_clears_bounds(caplog): sc, si, end = clang_state.resolve_start_end_dates(since, until) assert any("invalid date range" in r.getMessage() for r in caplog.records) assert end is None - assert sc is not None and si is not None + delta = timedelta(milliseconds=1) + assert sc == wm + delta + assert si == wm + delta @pytest.mark.django_db diff --git a/clang_github_tracker/workspace.py b/clang_github_tracker/workspace.py index c7f14ae..3e7eef8 100644 --- a/clang_github_tracker/workspace.py +++ 
b/clang_github_tracker/workspace.py @@ -1,11 +1,9 @@ """ -Workspace paths for clang_github_tracker: md export, backfill CSV dir, raw GitHub JSON. +Workspace paths for clang_github_tracker: md export, raw GitHub JSON. Layout: workspace/clang_github_activity/ - md_export/ (generated Markdown for GitHub publish) - workspace/clang_github_tracker/ - - clang_github_tracker_backfill.csv (default CSV backfill path) workspace/raw/github_activity_tracker/<owner>/<repo>/ - commits/, issues/, prs/ """ @@ -18,11 +16,8 @@ from config.workspace import get_workspace_path _APP_SLUG = "clang_github_activity" -_TRACKER_DATA_SLUG = "clang_github_tracker" _RAW_APP_SLUG = "github_activity_tracker" -DEFAULT_BACKFILL_CSV_NAME = "clang_github_tracker_backfill.csv" - # Repo we sync (raw only, no DB); from settings (env: CLANG_GITHUB_OWNER, CLANG_GITHUB_REPO) OWNER = settings.CLANG_GITHUB_OWNER REPO = settings.CLANG_GITHUB_REPO @@ -44,18 +39,6 @@ def get_workspace_root() -> Path: return get_workspace_path(_APP_SLUG) -def get_clang_github_tracker_data_dir() -> Path: - """Return workspace/clang_github_tracker/; creates dir if missing.""" - path = get_workspace_path(_TRACKER_DATA_SLUG) - path.mkdir(parents=True, exist_ok=True) - return path - - -def default_backfill_csv_path() -> Path: - """Default path for CSV backfill: workspace/clang_github_tracker/<DEFAULT_BACKFILL_CSV_NAME>.""" - return get_clang_github_tracker_data_dir() / DEFAULT_BACKFILL_CSV_NAME - - def get_raw_root() -> Path: """Return workspace/raw/github_activity_tracker/; creates dirs if missing.""" path = get_workspace_path("raw") / _RAW_APP_SLUG diff --git a/docs/Workspace.md b/docs/Workspace.md index 711fe5d..e5b6696 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -21,7 +21,6 @@ workspace/ # WORKSPACE_DIR (configurable via │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── <list_name>/<msg_id>.json ├── clang_github_activity/ # Markdown export for clang_github_tracker (md_export/) -├── 
clang_github_tracker/ # Optional CSV backfill (default: clang_github_tracker_backfill.csv) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── <list_name>/ │ └── messages/<msg_id>.json # Formatted cache (processed then removed) diff --git a/github_ops/git_ops.py b/github_ops/git_ops.py index 4d9ff0b..307592b 100644 --- a/github_ops/git_ops.py +++ b/github_ops/git_ops.py @@ -205,7 +205,8 @@ def clone_repo( cmd = ["git", "clone", clone_url, str(dest_dir)] if depth is not None: cmd.extend(["--depth", str(depth)]) - logger.info("Cloning %s -> %s", url_or_slug, dest_dir) + safe_url_or_slug = sanitize_git_output(url_or_slug) + logger.info("Cloning %s -> %s", safe_url_or_slug, dest_dir) try: subprocess.run( cmd, @@ -217,10 +218,9 @@ def clone_repo( timeout=GIT_CMD_TIMEOUT_SECONDS, ) except subprocess.TimeoutExpired as e: - safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] + safe_cmd: list[str] = ["git", "clone", safe_url_or_slug, str(dest_dir)] if depth is not None: safe_cmd.extend(["--depth", str(depth)]) - safe_url_or_slug = sanitize_git_output(url_or_slug) logger.warning( "git clone timed out after %ss (%s -> %s)", GIT_CMD_TIMEOUT_SECONDS, @@ -238,13 +238,13 @@ def clone_repo( safe_err_tail = sanitize_git_output(err_tail) logger.warning( "git clone failed (%s -> %s), returncode=%s, stderr/stdout_tail=%r", - url_or_slug, + safe_url_or_slug, dest_dir, e.returncode, safe_err_tail, ) # Never re-raise with the real cmd or raw output: they may embed the token. 
- safe_cmd: list[str] = ["git", "clone", url_or_slug, str(dest_dir)] + safe_cmd: list[str] = ["git", "clone", safe_url_or_slug, str(dest_dir)] if depth is not None: safe_cmd.extend(["--depth", str(depth)]) safe_stdout = sanitize_git_output(e.stdout or "") From e7384f40c1728e25e1571b470afaff4e0d8e14ae Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Tue, 7 Apr 2026 01:29:14 -0400 Subject: [PATCH 73/76] Update the code --- .../commands/backfill_clang_github_tracker.py | 58 +++++++++++-------- clang_github_tracker/services.py | 23 +++++++- clang_github_tracker/state_manager.py | 10 ---- clang_github_tracker/sync_raw.py | 32 ++++++---- 4 files changed, 75 insertions(+), 48 deletions(-) diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py index 2379dea..979638e 100644 --- a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -12,30 +12,22 @@ from django.core.management.base import BaseCommand, CommandError from clang_github_tracker import services as clang_services -from clang_github_tracker import state_manager as clang_state from clang_github_tracker.workspace import OWNER, REPO, get_raw_repo_dir from github_activity_tracker.sync.utils import ( normalize_issue_json, normalize_pr_json, - parse_datetime, ) +from core.utils.datetime_parsing import parse_iso_datetime as parse_datetime + +from clang_github_tracker.sync_raw import commit_date + logger = logging.getLogger(__name__) _SHA40 = re.compile(r"^[0-9a-fA-F]{40}$") _RAW_CHUNK_EVERY = 10_000 -def _commit_date_from_json(data: dict): - """Parse commit author/committer date from a GitHub API-style JSON dict.""" - commit = data.get("commit") or {} - author = commit.get("author") or commit.get("committer") or {} - date_str = author.get("date") - if not date_str: - return None - return 
parse_datetime(date_str) or clang_state.parse_iso(date_str) - - class Command(BaseCommand): """Load ``ClangGithubIssueItem`` / ``ClangGithubCommit`` from raw JSON dirs.""" @@ -59,20 +51,26 @@ def _backfill_from_raw(self) -> None: commit_rows: list[tuple[str, datetime | None]] = [] c_skip = 0 c_ins_total = c_upd_total = 0 - for c_read_n, p in enumerate(sorted(commits_dir.glob("*.json")), start=1): + for c_read_n, p in enumerate( + sorted(commits_dir.glob("*.json")), start=1 + ): try: data = json.loads(p.read_text(encoding="utf-8")) sha = (data.get("sha") or "").strip() if not _SHA40.match(sha): c_skip += 1 continue - commit_rows.append((sha, _commit_date_from_json(data))) - except Exception as e: # pylint: disable=broad-exception-caught + commit_rows.append((sha, commit_date(data))) + except ( + Exception + ) as e: # pylint: disable=broad-exception-caught logger.warning("skip commit file %s: %s", p, e) c_skip += 1 if c_read_n % _RAW_CHUNK_EVERY == 0: if commit_rows: - ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) + ins_c, upd_c = clang_services.upsert_commits_batch( + commit_rows + ) c_ins_total += ins_c c_upd_total += upd_c commit_rows.clear() @@ -95,14 +93,18 @@ def _backfill_from_raw(self) -> None: c_skip, ) - issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] + issue_rows: list[ + tuple[int, bool, datetime | None, datetime | None] + ] = [] i_ins_total = i_upd_total = 0 issues_dir = root / "issues" if issues_dir.is_dir(): i_skip = 0 i_ok = 0 - for i_read_n, p in enumerate(sorted(issues_dir.glob("*.json")), start=1): + for i_read_n, p in enumerate( + sorted(issues_dir.glob("*.json")), start=1 + ): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_issue_json(data) @@ -119,7 +121,9 @@ def _backfill_from_raw(self) -> None: ) ) i_ok += 1 - except Exception as e: # pylint: disable=broad-exception-caught + except ( + Exception + ) as e: # pylint: disable=broad-exception-caught logger.warning("skip issue file 
%s: %s", p, e) i_skip += 1 if i_read_n % _RAW_CHUNK_EVERY == 0: @@ -138,7 +142,9 @@ def _backfill_from_raw(self) -> None: i_upd_total, ) if issue_rows: - ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) i_ins_total += ins_i i_upd_total += upd_i issue_rows.clear() @@ -148,7 +154,9 @@ def _backfill_from_raw(self) -> None: if prs_dir.is_dir(): pr_skip = 0 pr_ok = 0 - for pr_read_n, p in enumerate(sorted(prs_dir.glob("*.json")), start=1): + for pr_read_n, p in enumerate( + sorted(prs_dir.glob("*.json")), start=1 + ): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_pr_json(data) @@ -165,7 +173,9 @@ def _backfill_from_raw(self) -> None: ) ) pr_ok += 1 - except Exception as e: # pylint: disable=broad-exception-caught + except ( + Exception + ) as e: # pylint: disable=broad-exception-caught logger.warning("skip pr file %s: %s", p, e) pr_skip += 1 if pr_read_n % _RAW_CHUNK_EVERY == 0: @@ -184,7 +194,9 @@ def _backfill_from_raw(self) -> None: i_upd_total, ) if issue_rows: - ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) + ins_i, upd_i = clang_services.upsert_issue_items_batch( + issue_rows + ) i_ins_total += ins_i i_upd_total += upd_i issue_rows.clear() diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index 42411fb..7643120 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -22,7 +22,9 @@ def _invalid_issue_number(n: object) -> bool: return isinstance(n, bool) or not isinstance(n, int) or n <= 0 -def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | None: +def _max_dt( + current: datetime | None, incoming: datetime | None +) -> datetime | None: """Return the later of two datetimes; ``None`` is treated as missing (never wins over a value).""" if current is None: return incoming @@ -60,7 +62,9 @@ def upsert_issue_item( ) -> tuple[ClangGithubIssueItem, 
bool]: """Create or update a ClangGithubIssueItem by ``number``. Returns (instance, created).""" if _invalid_issue_number(number): - raise ValueError(f"issue number must be a positive integer, got {number!r}") + raise ValueError( + f"issue number must be a positive integer, got {number!r}" + ) existing = ClangGithubIssueItem.objects.filter(number=number).first() is_pr, gc, gu = _merge_issue_item_fields( existing, @@ -156,6 +160,17 @@ def upsert_commits_batch( Returns: (inserted, updated) counts across all batches. """ + if batch_size <= 0: + logger.warning( + "batch_size must be positive, using %s", DEFAULT_UPSERT_BATCH_SIZE + ) + batch_size = DEFAULT_UPSERT_BATCH_SIZE + if batch_size > len(rows): + logger.warning( + "batch_size is greater than the number of rows, using %s", + len(rows), + ) + batch_size = len(rows) merged: dict[str, datetime | None] = {} for sha, dt in rows: s = (sha or "").strip().lower() @@ -250,7 +265,9 @@ def upsert_issue_items_batch( _max_dt(prev_gu, gu), ) inserted = updated = 0 - items = [(n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items())] + items = [ + (n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items()) + ] for i in range(0, len(items), batch_size): di, du = _flush_issue_items_chunk(items[i : i + batch_size]) inserted += di diff --git a/clang_github_tracker/state_manager.py b/clang_github_tracker/state_manager.py index 318a2d5..36e8ddf 100644 --- a/clang_github_tracker/state_manager.py +++ b/clang_github_tracker/state_manager.py @@ -20,16 +20,6 @@ logger = logging.getLogger(__name__) -def parse_iso(s: str | None) -> datetime | None: - """Parse ISO datetime string; returns None if missing or invalid.""" - if not s or not isinstance(s, str) or not s.strip(): - return None - try: - return datetime.fromisoformat(s.strip().replace("Z", "+00:00")) - except (ValueError, TypeError): - return None - - def _aware_utc(dt: datetime | None) -> datetime | None: """Normalize ``dt`` to timezone-aware UTC, or return 
``None``.""" if dt is None: diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index d5a456a..a25c1e6 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -19,12 +19,12 @@ from github_activity_tracker.sync.utils import ( normalize_issue_json, normalize_pr_json, - parse_datetime, ) + +from core.utils.datetime_parsing import parse_iso_datetime as parse_datetime from github_ops import get_github_client from github_ops.client import ConnectionException, RateLimitException -from clang_github_tracker import state_manager as clang_state from clang_github_tracker import services as clang_services from clang_github_tracker.workspace import OWNER, REPO @@ -45,14 +45,14 @@ def _valid_positive_issue_number(n: object) -> bool: return type(n) is int and n > 0 -def _commit_date(commit_data: dict) -> datetime | None: +def commit_date(commit_data: dict) -> datetime | None: """Extract author/committer date from GitHub commit payload.""" commit = commit_data.get("commit") or {} author = commit.get("author") or commit.get("committer") or {} - date_str = author.get("date") + date_str = author.get("date") or "" if not date_str: return None - return parse_datetime(date_str) or clang_state.parse_iso(date_str) + return parse_datetime(date_str) def sync_clang_github_activity( @@ -93,7 +93,7 @@ def sync_clang_github_activity( if sha: save_commit_raw_source(owner, repo, commit_data) commits_saved += 1 - committed_at = _commit_date(commit_data) + committed_at = commit_date(commit_data) try: clang_services.upsert_commit( str(sha).strip(), @@ -115,14 +115,18 @@ def sync_clang_github_activity( clang_services.upsert_issue_item( num, is_pull_request=True, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + github_created_at=parse_datetime( + flat.get("created_at") + ), + github_updated_at=parse_datetime( + flat.get("updated_at") + ), ) pr_numbers.append(num) else: - 
issue_number = (item.get("issue_info") or {}).get("number") or item.get( + issue_number = (item.get("issue_info") or {}).get( "number" - ) + ) or item.get("number") if issue_number is not None: save_issue_raw_source(owner, repo, item) flat = normalize_issue_json(item) @@ -131,8 +135,12 @@ def sync_clang_github_activity( clang_services.upsert_issue_item( num, is_pull_request=False, - github_created_at=parse_datetime(flat.get("created_at")), - github_updated_at=parse_datetime(flat.get("updated_at")), + github_created_at=parse_datetime( + flat.get("created_at") + ), + github_updated_at=parse_datetime( + flat.get("updated_at") + ), ) issue_numbers.append(num) From dfc7d694b9772f913f9298d5a813737ba3bc3d38 Mon Sep 17 00:00:00 2001 From: snowfox1003 <snowfox1003@gmail.com> Date: Tue, 7 Apr 2026 11:11:26 -0400 Subject: [PATCH 74/76] Fix: staging logic in clange github tracker, use core functions - #136 --- .../commands/backfill_clang_github_tracker.py | 40 +-- clang_github_tracker/services.py | 28 +- clang_github_tracker/sync_raw.py | 283 ++++++++++++++---- .../tests/test_state_manager.py | 18 -- clang_github_tracker/workspace.py | 6 +- config/test_settings.py | 1 - core/utils/datetime_parsing.py | 15 + docs/Workspace.md | 2 +- 8 files changed, 279 insertions(+), 114 deletions(-) diff --git a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py index 979638e..747ef65 100644 --- a/clang_github_tracker/management/commands/backfill_clang_github_tracker.py +++ b/clang_github_tracker/management/commands/backfill_clang_github_tracker.py @@ -51,9 +51,7 @@ def _backfill_from_raw(self) -> None: commit_rows: list[tuple[str, datetime | None]] = [] c_skip = 0 c_ins_total = c_upd_total = 0 - for c_read_n, p in enumerate( - sorted(commits_dir.glob("*.json")), start=1 - ): + for c_read_n, p in enumerate(sorted(commits_dir.glob("*.json")), start=1): try: data = 
json.loads(p.read_text(encoding="utf-8")) sha = (data.get("sha") or "").strip() @@ -61,16 +59,12 @@ def _backfill_from_raw(self) -> None: c_skip += 1 continue commit_rows.append((sha, commit_date(data))) - except ( - Exception - ) as e: # pylint: disable=broad-exception-caught + except Exception as e: # pylint: disable=broad-exception-caught logger.warning("skip commit file %s: %s", p, e) c_skip += 1 if c_read_n % _RAW_CHUNK_EVERY == 0: if commit_rows: - ins_c, upd_c = clang_services.upsert_commits_batch( - commit_rows - ) + ins_c, upd_c = clang_services.upsert_commits_batch(commit_rows) c_ins_total += ins_c c_upd_total += upd_c commit_rows.clear() @@ -93,18 +87,14 @@ def _backfill_from_raw(self) -> None: c_skip, ) - issue_rows: list[ - tuple[int, bool, datetime | None, datetime | None] - ] = [] + issue_rows: list[tuple[int, bool, datetime | None, datetime | None]] = [] i_ins_total = i_upd_total = 0 issues_dir = root / "issues" if issues_dir.is_dir(): i_skip = 0 i_ok = 0 - for i_read_n, p in enumerate( - sorted(issues_dir.glob("*.json")), start=1 - ): + for i_read_n, p in enumerate(sorted(issues_dir.glob("*.json")), start=1): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_issue_json(data) @@ -121,9 +111,7 @@ def _backfill_from_raw(self) -> None: ) ) i_ok += 1 - except ( - Exception - ) as e: # pylint: disable=broad-exception-caught + except Exception as e: # pylint: disable=broad-exception-caught logger.warning("skip issue file %s: %s", p, e) i_skip += 1 if i_read_n % _RAW_CHUNK_EVERY == 0: @@ -142,9 +130,7 @@ def _backfill_from_raw(self) -> None: i_upd_total, ) if issue_rows: - ins_i, upd_i = clang_services.upsert_issue_items_batch( - issue_rows - ) + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) i_ins_total += ins_i i_upd_total += upd_i issue_rows.clear() @@ -154,9 +140,7 @@ def _backfill_from_raw(self) -> None: if prs_dir.is_dir(): pr_skip = 0 pr_ok = 0 - for pr_read_n, p in enumerate( - 
sorted(prs_dir.glob("*.json")), start=1 - ): + for pr_read_n, p in enumerate(sorted(prs_dir.glob("*.json")), start=1): try: data = json.loads(p.read_text(encoding="utf-8")) flat = normalize_pr_json(data) @@ -173,9 +157,7 @@ def _backfill_from_raw(self) -> None: ) ) pr_ok += 1 - except ( - Exception - ) as e: # pylint: disable=broad-exception-caught + except Exception as e: # pylint: disable=broad-exception-caught logger.warning("skip pr file %s: %s", p, e) pr_skip += 1 if pr_read_n % _RAW_CHUNK_EVERY == 0: @@ -194,9 +176,7 @@ def _backfill_from_raw(self) -> None: i_upd_total, ) if issue_rows: - ins_i, upd_i = clang_services.upsert_issue_items_batch( - issue_rows - ) + ins_i, upd_i = clang_services.upsert_issue_items_batch(issue_rows) i_ins_total += ins_i i_upd_total += upd_i issue_rows.clear() diff --git a/clang_github_tracker/services.py b/clang_github_tracker/services.py index 7643120..9cee1a0 100644 --- a/clang_github_tracker/services.py +++ b/clang_github_tracker/services.py @@ -11,6 +11,7 @@ from django.utils import timezone from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem +from core.utils.datetime_parsing import ensure_aware_utc logger = logging.getLogger(__name__) @@ -22,9 +23,7 @@ def _invalid_issue_number(n: object) -> bool: return isinstance(n, bool) or not isinstance(n, int) or n <= 0 -def _max_dt( - current: datetime | None, incoming: datetime | None -) -> datetime | None: +def _max_dt(current: datetime | None, incoming: datetime | None) -> datetime | None: """Return the later of two datetimes; ``None`` is treated as missing (never wins over a value).""" if current is None: return incoming @@ -62,9 +61,9 @@ def upsert_issue_item( ) -> tuple[ClangGithubIssueItem, bool]: """Create or update a ClangGithubIssueItem by ``number``. 
Returns (instance, created).""" if _invalid_issue_number(number): - raise ValueError( - f"issue number must be a positive integer, got {number!r}" - ) + raise ValueError(f"issue number must be a positive integer, got {number!r}") + github_created_at = ensure_aware_utc(github_created_at) + github_updated_at = ensure_aware_utc(github_updated_at) existing = ClangGithubIssueItem.objects.filter(number=number).first() is_pr, gc, gu = _merge_issue_item_fields( existing, @@ -98,6 +97,7 @@ def upsert_commit( sha_clean = (sha or "").strip().lower() if len(sha_clean) != 40: raise ValueError(f"commit sha must be 40 hex chars, got {sha_clean!r}") + github_committed_at = ensure_aware_utc(github_committed_at) existing = ClangGithubCommit.objects.filter(sha=sha_clean).first() merged_committed_at = _max_dt( existing.github_committed_at if existing else None, @@ -133,7 +133,10 @@ def _flush_commits_chunk( objs = [ ClangGithubCommit( sha=s, - github_committed_at=_max_dt(existing_committed.get(s), dt), + github_committed_at=_max_dt( + ensure_aware_utc(existing_committed.get(s)), + ensure_aware_utc(dt), + ), updated_at=now, ) for s, dt in pairs @@ -176,7 +179,8 @@ def upsert_commits_batch( s = (sha or "").strip().lower() if len(s) != 40: continue - merged[s] = _max_dt(merged.get(s), dt) + dt_a = ensure_aware_utc(dt) + merged[s] = _max_dt(merged.get(s), dt_a) inserted = updated = 0 items = list(merged.items()) for i in range(0, len(items), batch_size): @@ -206,6 +210,8 @@ def _flush_issue_items_chunk( now = timezone.now() objs = [] for n, is_pr, gc, gu in rows: + gc = ensure_aware_utc(gc) + gu = ensure_aware_utc(gu) m_is_pr, m_gc, m_gu = _merge_issue_item_fields( existing_by_num.get(n), is_pr, gc, gu ) @@ -254,6 +260,8 @@ def upsert_issue_items_batch( for num, is_pr, gc, gu in rows: if _invalid_issue_number(num): continue + gc = ensure_aware_utc(gc) + gu = ensure_aware_utc(gu) prev = merged.get(num) if prev is None: merged[num] = (is_pr, gc, gu) @@ -265,9 +273,7 @@ def 
upsert_issue_items_batch( _max_dt(prev_gu, gu), ) inserted = updated = 0 - items = [ - (n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items()) - ] + items = [(n, is_pr, gc, gu) for n, (is_pr, gc, gu) in sorted(merged.items())] for i in range(0, len(items), batch_size): di, du = _flush_issue_items_chunk(items[i : i + batch_size]) inserted += di diff --git a/clang_github_tracker/sync_raw.py b/clang_github_tracker/sync_raw.py index a25c1e6..f6df55a 100644 --- a/clang_github_tracker/sync_raw.py +++ b/clang_github_tracker/sync_raw.py @@ -1,13 +1,18 @@ """ Sync llvm/llvm-project to raw/github_activity_tracker and clang_github_tracker DB. -Uses github_activity_tracker.fetcher and raw_source; persists issue/PR/commit rows via services. +Staging: JSON is written under workspace/github_activity_tracker/<owner>/<repo>/ +(commits|issues|prs). After a successful DB upsert and raw write, the staging file is +removed. On any processing error the staging file is left for the next run. +Pending staging files are processed before any API fetch. 
""" from __future__ import annotations +import json import logging from datetime import datetime, timezone +from pathlib import Path from typing import Optional from github_activity_tracker import fetcher @@ -20,6 +25,14 @@ normalize_issue_json, normalize_pr_json, ) +from github_activity_tracker.workspace import ( + get_commit_json_path, + get_issue_json_path, + get_pr_json_path, + iter_existing_commit_jsons, + iter_existing_issue_jsons, + iter_existing_pr_jsons, +) from core.utils.datetime_parsing import parse_iso_datetime as parse_datetime from github_ops import get_github_client @@ -55,14 +68,191 @@ def commit_date(commit_data: dict) -> datetime | None: return parse_datetime(date_str) +def _write_staging_json(path: Path, data: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2, default=str), encoding="utf-8") + + +def _promote_commit_staging( + owner: str, repo: str, staging_path: Path, commit_data: dict +) -> bool: + """ + Upsert commit to DB, write raw JSON, remove staging file. + + Returns True if fully successful. On failure the staging file is kept (except + when the payload cannot be processed — invalid sha — staging is removed). 
+ """ + sha = commit_data.get("sha") + if not isinstance(sha, str) or not sha.strip(): + logger.warning( + "clang sync: drop staging commit (missing sha): %s", staging_path + ) + staging_path.unlink(missing_ok=True) + return False + committed_at = commit_date(commit_data) + try: + clang_services.upsert_commit( + str(sha).strip(), + github_committed_at=committed_at, + ) + except Exception as e: + logger.warning( + "clang sync: commit DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_commit_raw_source(owner, repo, commit_data) + except Exception: + logger.exception( + "clang sync: raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def _promote_issue_staging( + owner: str, repo: str, staging_path: Path, item: dict +) -> bool: + flat = normalize_issue_json(item) + num = flat.get("number") + if not _valid_positive_issue_number(num): + logger.warning( + "clang sync: drop staging issue (invalid number): %s", staging_path + ) + staging_path.unlink(missing_ok=True) + return False + try: + clang_services.upsert_issue_item( + num, + is_pull_request=False, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + except Exception as e: + logger.warning( + "clang sync: issue DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_issue_raw_source(owner, repo, item) + except Exception: + logger.exception( + "clang sync: issue raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def _promote_pr_staging(owner: str, repo: str, staging_path: Path, item: dict) -> bool: + flat = normalize_pr_json(item) + num = flat.get("number") + if not _valid_positive_issue_number(num): + logger.warning("clang sync: drop staging PR (invalid number): %s", 
staging_path) + staging_path.unlink(missing_ok=True) + return False + try: + clang_services.upsert_issue_item( + num, + is_pull_request=True, + github_created_at=parse_datetime(flat.get("created_at")), + github_updated_at=parse_datetime(flat.get("updated_at")), + ) + except Exception as e: + logger.warning( + "clang sync: PR DB upsert failed, keeping staging %s: %s", + staging_path, + e, + ) + return False + try: + save_pr_raw_source(owner, repo, item) + except Exception: + logger.exception( + "clang sync: PR raw write failed after DB upsert, keeping staging %s", + staging_path, + ) + return False + staging_path.unlink(missing_ok=True) + return True + + +def process_pending_clang_staging( + owner: str, + repo: str, +) -> tuple[int, list[int], list[int]]: + """ + Process workspace/github_activity_tracker/<owner>/<repo>/ commits, issues, prs. + + Returns (commits_promoted, issue_numbers, pr_numbers) for successful promotions. + """ + commits_promoted = 0 + issue_numbers: list[int] = [] + pr_numbers: list[int] = [] + + for path in sorted(iter_existing_commit_jsons(owner, repo), key=lambda p: p.name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging commit %s", path) + continue + if not isinstance(data, dict): + continue + if _promote_commit_staging(owner, repo, path, data): + commits_promoted += 1 + + for path in sorted(iter_existing_issue_jsons(owner, repo), key=lambda p: p.name): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging issue %s", path) + continue + if not isinstance(data, dict): + continue + flat = normalize_issue_json(data) + num = flat.get("number") + if _promote_issue_staging( + owner, repo, path, data + ) and _valid_positive_issue_number(num): + issue_numbers.append(num) + + for path in sorted(iter_existing_pr_jsons(owner, repo), key=lambda p: p.name): + try: + data = 
json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.exception("clang sync: unreadable staging PR %s", path) + continue + if not isinstance(data, dict): + continue + flat = normalize_pr_json(data) + num = flat.get("number") + if _promote_pr_staging( + owner, repo, path, data + ) and _valid_positive_issue_number(num): + pr_numbers.append(num) + + return commits_promoted, issue_numbers, pr_numbers + + def sync_clang_github_activity( start_commit: datetime | None = None, start_item: datetime | None = None, end_date: Optional[datetime] = None, ) -> tuple[int, list[int], list[int]]: """ - Fetch llvm/llvm-project commits, issues, PRs from GitHub and save to raw paths - and upsert ``ClangGithubCommit`` / ``ClangGithubIssueItem``. + Fetch llvm/llvm-project commits, issues, PRs from GitHub and upsert DB rows. + + Staging JSON lives under ``workspace/github_activity_tracker/<owner>/<repo>/``; + after a successful DB upsert and raw write under + ``workspace/raw/github_activity_tracker/...``, staging files are removed. + Pending staging files are processed before any API fetch. Args: start_commit: Start date for commits (None = from beginning). 
@@ -81,68 +271,61 @@ def sync_clang_github_activity( client = get_github_client(use="scraping") - commits_saved = 0 - issue_numbers: list[int] = [] - pr_numbers: list[int] = [] + pending_c, pending_i, pending_p = process_pending_clang_staging(owner, repo) + commits_saved = pending_c + issue_numbers: list[int] = list(pending_i) + pr_numbers: list[int] = list(pending_p) try: for commit_data in fetcher.fetch_commits_from_github( client, owner, repo, start_commit, end_date ): sha = commit_data.get("sha") - if sha: - save_commit_raw_source(owner, repo, commit_data) + if not isinstance(sha, str) or not sha.strip(): + continue + sha_clean = sha.strip() + staging_path = get_commit_json_path(owner, repo, sha_clean) + _write_staging_json(staging_path, commit_data) + if _promote_commit_staging(owner, repo, staging_path, commit_data): commits_saved += 1 - committed_at = commit_date(commit_data) - try: - clang_services.upsert_commit( - str(sha).strip(), - github_committed_at=committed_at, - ) - except ValueError as e: - logger.warning("skip commit DB upsert: %s", e) for item in fetcher.fetch_issues_and_prs_from_github( client, owner, repo, start_item, end_date ): if "pr_info" in item: pr_number = (item["pr_info"] or {}).get("number") - if pr_number is not None: - save_pr_raw_source(owner, repo, item) - flat = normalize_pr_json(item) - num = flat.get("number") - if _valid_positive_issue_number(num): - clang_services.upsert_issue_item( - num, - is_pull_request=True, - github_created_at=parse_datetime( - flat.get("created_at") - ), - github_updated_at=parse_datetime( - flat.get("updated_at") - ), - ) - pr_numbers.append(num) + if pr_number is None: + continue + if isinstance(pr_number, str) and pr_number.isdigit(): + pr_number = int(pr_number) + if type(pr_number) is not int or pr_number <= 0: + continue + staging_path = get_pr_json_path(owner, repo, pr_number) + _write_staging_json(staging_path, item) + flat = normalize_pr_json(item) + num = flat.get("number") + if 
_promote_pr_staging(owner, repo, staging_path, item) and ( + _valid_positive_issue_number(num) + ): + pr_numbers.append(num) else: - issue_number = (item.get("issue_info") or {}).get( + issue_number = (item.get("issue_info") or {}).get("number") or item.get( "number" - ) or item.get("number") - if issue_number is not None: - save_issue_raw_source(owner, repo, item) - flat = normalize_issue_json(item) - num = flat.get("number") - if _valid_positive_issue_number(num): - clang_services.upsert_issue_item( - num, - is_pull_request=False, - github_created_at=parse_datetime( - flat.get("created_at") - ), - github_updated_at=parse_datetime( - flat.get("updated_at") - ), - ) - issue_numbers.append(num) + ) + if issue_number is None: + continue + if isinstance(issue_number, str) and issue_number.isdigit(): + issue_number = int(issue_number) + if type(issue_number) is not int or issue_number <= 0: + continue + staging_path = get_issue_json_path(owner, repo, issue_number) + _write_staging_json(staging_path, item) + flat = normalize_issue_json(item) + num = flat.get("number") + if _promote_issue_staging(owner, repo, staging_path, item) and ( + _valid_positive_issue_number(num) + ): + issue_numbers.append(num) except (ConnectionException, RateLimitException) as e: logger.exception("clang_github_tracker sync failed: %s", e) diff --git a/clang_github_tracker/tests/test_state_manager.py b/clang_github_tracker/tests/test_state_manager.py index 20b8e51..cf0338a 100644 --- a/clang_github_tracker/tests/test_state_manager.py +++ b/clang_github_tracker/tests/test_state_manager.py @@ -9,24 +9,6 @@ from clang_github_tracker.models import ClangGithubCommit, ClangGithubIssueItem -def test_parse_iso_valid(): - """parse_iso returns datetime for valid ISO strings.""" - dt = clang_state.parse_iso("2024-01-15T10:30:00Z") - assert dt is not None - assert dt.year == 2024 and dt.month == 1 and dt.day == 15 - dt2 = clang_state.parse_iso("2024-06-01T00:00:00+00:00") - assert dt2 is not None - assert 
dt2.month == 6 - - -def test_parse_iso_invalid_or_empty(): - """parse_iso returns None for empty or invalid input.""" - assert clang_state.parse_iso(None) is None - assert clang_state.parse_iso("") is None - assert clang_state.parse_iso(" ") is None - assert clang_state.parse_iso("not-a-date") is None - - @pytest.mark.django_db def test_resolve_empty_db_no_since_until(): """Empty tables → None starts; end None until caller passes --until.""" diff --git a/clang_github_tracker/workspace.py b/clang_github_tracker/workspace.py index 3e7eef8..a7d495a 100644 --- a/clang_github_tracker/workspace.py +++ b/clang_github_tracker/workspace.py @@ -2,7 +2,7 @@ Workspace paths for clang_github_tracker: md export, raw GitHub JSON. Layout: - workspace/clang_github_activity/ + workspace/github_activity_tracker/ - md_export/ (generated Markdown for GitHub publish) workspace/raw/github_activity_tracker/<owner>/<repo>/ - commits/, issues/, prs/ @@ -15,7 +15,7 @@ from config.workspace import get_workspace_path -_APP_SLUG = "clang_github_activity" +_APP_SLUG = "github_activity_tracker" _RAW_APP_SLUG = "github_activity_tracker" # Repo we sync (raw only, no DB); from settings (env: CLANG_GITHUB_OWNER, CLANG_GITHUB_REPO) @@ -35,7 +35,7 @@ def _sanitize_segment(value: str, label: str) -> str: def get_workspace_root() -> Path: - """Return workspace/clang_github_activity/; creates dir if missing.""" + """Return workspace/clang_github_tracker/; creates dir if missing.""" return get_workspace_path(_APP_SLUG) diff --git a/config/test_settings.py b/config/test_settings.py index a175bdb..c1c3f83 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -50,7 +50,6 @@ for _slug in ( "github_activity_tracker", "boost_library_tracker", - "clang_github_activity", "clang_github_tracker", "discord_activity_tracker", "shared", diff --git a/core/utils/datetime_parsing.py b/core/utils/datetime_parsing.py index fa4710c..f1bfb1e 100644 --- a/core/utils/datetime_parsing.py +++ 
b/core/utils/datetime_parsing.py @@ -4,6 +4,21 @@ from datetime import datetime, timezone +from django.utils import timezone as django_timezone + + +def ensure_aware_utc(dt: datetime | None) -> datetime | None: + """ + Normalize a datetime for ``DateTimeField`` when ``USE_TZ`` is True. + + Naive values are treated as UTC. Aware values are converted to UTC. + """ + if dt is None: + return None + if django_timezone.is_naive(dt): + return django_timezone.make_aware(dt, django_timezone.utc) + return dt.astimezone(django_timezone.utc) + def parse_iso_datetime(raw: str | None) -> datetime | None: """ diff --git a/docs/Workspace.md b/docs/Workspace.md index e5b6696..21afe13 100644 --- a/docs/Workspace.md +++ b/docs/Workspace.md @@ -20,7 +20,7 @@ workspace/ # WORKSPACE_DIR (configurable via │ │ └── prs/<number>.json │ └── boost_mailing_list_tracker/ # Raw API responses (kept, not removed) │ └── <list_name>/<msg_id>.json -├── clang_github_activity/ # Markdown export for clang_github_tracker (md_export/) +├── clang_github_tracker/ # Markdown export for clang_github_tracker (md_export/) ├── boost_mailing_list_tracker/ # Mailing list messages (see below) │ └── <list_name>/ │ └── messages/<msg_id>.json # Formatted cache (processed then removed) From 57d9334990537f24e15ead0d49eab5e2322eb75a Mon Sep 17 00:00:00 2001 From: Leo Chen <leo.chen0412@outlook.com> Date: Mon, 20 Apr 2026 11:18:26 -0700 Subject: [PATCH 75/76] Fix: lint/format error --- cppa_user_tracker/services.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 2317586..f285323 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -386,6 +386,8 @@ def get_or_create_wg21_paper_author_profile( if email_val: add_email(profile, email_val, is_primary=True) return profile, True + + def get_or_create_youtube_speaker( external_id: str, display_name: str = "", From e122eb7b4a794ba5031b68d8b956dc3cd52dc6d6 Mon Sep 17 00:00:00 2001 
From: Leo Chen <leo.chen0412@outlook.com> Date: Mon, 20 Apr 2026 11:36:02 -0700 Subject: [PATCH 76/76] Fix: compose error --- ..._wg21_author_alias_youtubespeaker_external_id.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py diff --git a/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py new file mode 100644 index 0000000..fcdc4f2 --- /dev/null +++ b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py @@ -0,0 +1,13 @@ +# Merge parallel branches from 0004: WG21 author_alias vs YouTube speaker chain. + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), + ("cppa_user_tracker", "0007_youtubespeaker_external_id"), + ] + + operations = []