From e3c1a65d536f0af523bbf4ade6936163e4cee4db Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 09:23:21 -0700 Subject: [PATCH 01/20] Initial wg21-paper-tracker added --- config/settings.py | 8 + .../0005_alter_slackuser_slack_user_id.py | 18 ++ cppa_user_tracker/services.py | 27 +- docs/Schema.md | 20 +- docs/operations/WG21_Cloud_Run.md | 61 ++++ parse_test.py | 8 + parse_year.py | 20 ++ requirements.txt | 6 + wg21_paper_tracker/__init__.py | 0 wg21_paper_tracker/admin.py | 31 +++ wg21_paper_tracker/apps.py | 7 + wg21_paper_tracker/cloud_run_job/Dockerfile | 24 ++ .../cloud_run_job/converters/__init__.py | 9 + .../converters/docling_converter.py | 62 +++++ .../converters/openai_converter.py | 260 ++++++++++++++++++ .../converters/pdfplumber_converter.py | 92 +++++++ wg21_paper_tracker/cloud_run_job/main.py | 100 +++++++ .../cloud_run_job/requirements.txt | 6 + wg21_paper_tracker/fetcher.py | 156 +++++++++++ wg21_paper_tracker/management/__init__.py | 0 .../management/commands/__init__.py | 0 .../commands/run_wg21_paper_tracker.py | 56 ++++ wg21_paper_tracker/migrations/0001_initial.py | 69 +++++ wg21_paper_tracker/migrations/__init__.py | 0 wg21_paper_tracker/models.py | 76 +++++ wg21_paper_tracker/pipeline.py | 174 ++++++++++++ wg21_paper_tracker/services.py | 77 ++++++ wg21_paper_tracker/workspace.py | 23 ++ .../management/commands/run_all_collectors.py | 1 + 29 files changed, 1385 insertions(+), 6 deletions(-) create mode 100644 cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py create mode 100644 docs/operations/WG21_Cloud_Run.md create mode 100644 parse_test.py create mode 100644 parse_year.py create mode 100644 wg21_paper_tracker/__init__.py create mode 100644 wg21_paper_tracker/admin.py create mode 100644 wg21_paper_tracker/apps.py create mode 100644 wg21_paper_tracker/cloud_run_job/Dockerfile create mode 100644 wg21_paper_tracker/cloud_run_job/converters/__init__.py create mode 100644 wg21_paper_tracker/cloud_run_job/converters/docling_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/converters/openai_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py create mode 100644 wg21_paper_tracker/cloud_run_job/main.py create mode 100644 wg21_paper_tracker/cloud_run_job/requirements.txt create mode 100644 wg21_paper_tracker/fetcher.py create mode 100644 wg21_paper_tracker/management/__init__.py create mode 100644 wg21_paper_tracker/management/commands/__init__.py create mode 100644 wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py create mode 100644 wg21_paper_tracker/migrations/0001_initial.py create mode 100644 wg21_paper_tracker/migrations/__init__.py create mode 100644 wg21_paper_tracker/models.py create mode 100644 wg21_paper_tracker/pipeline.py create mode 100644 wg21_paper_tracker/services.py create mode 100644 wg21_paper_tracker/workspace.py diff --git a/config/settings.py b/config/settings.py index d45b438..ae4a50f 100644 --- a/config/settings.py +++ b/config/settings.py @@ -48,6 +48,7 @@ "cppa_slack_transcript_tracker", "cppa_slack_tracker", "discord_activity_tracker", + "wg21_paper_tracker", ] MIDDLEWARE = [ @@ -140,6 +141,7 @@ "cppa_slack_tracker", "discord_activity_tracker", "boost_mailing_list_tracker", + "wg21_paper_tracker", "shared", ) WORKSPACE_DIR.mkdir(parents=True, exist_ok=True) @@ -214,6 +216,12 @@ ) ).resolve() +# WG21 Paper Tracker Configuration +WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() +GCP_PROJECT_ID = (env("GCP_PROJECT_ID", 
default="") or "").strip() +GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() +WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "").strip() + # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) LOG_FILE = env("LOG_FILE", default="app.log") diff --git a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py new file mode 100644 index 0000000..f1cde2c --- /dev/null +++ b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.28 on 2026-03-09 15:35 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('cppa_user_tracker', '0004_alter_slackuser_slack_user_id_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='slackuser', + name='slack_user_id', + field=models.CharField(max_length=64, unique=True), + ), + ] diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index a583894..35503f4 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -26,6 +26,7 @@ MailingListProfile, SlackUser, DiscordProfile, + WG21PaperAuthorProfile, ) @@ -49,7 +50,9 @@ def get_or_create_identity( """Get or create an Identity by display_name. If exists, updates description from defaults.""" lookup = {"display_name": display_name} defaults = defaults or {"description": description} - identity, created = Identity.objects.get_or_create(defaults=defaults, **lookup) + identity, created = Identity.objects.get_or_create( + defaults=defaults, **lookup + ) if ( not created and "description" in defaults @@ -247,7 +250,9 @@ def _get_next_negative_github_account_id() -> int: @transaction.atomic -def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool]: +def get_or_create_slack_user( + user_data: dict[str, Any], +) -> tuple[SlackUser, bool]: """Get or create a SlackUser from Slack API user data. Returns (SlackUser, created). If the user exists, updates username, display_name, and avatar_url from user_data. 
@@ -260,7 +265,9 @@ def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool raise ValueError("Slack user ID ('id') is required") profile = user_data.get("profile") or {} username = (user_data.get("name") or "").strip() - display_name = (user_data.get("real_name") or user_data.get("name") or "").strip() + display_name = ( + user_data.get("real_name") or user_data.get("name") or "" + ).strip() avatar_url = (profile.get("image_72") or "").strip() user, created = SlackUser.objects.get_or_create( slack_user_id=user_id, @@ -303,7 +310,9 @@ def get_or_create_unknown_github_account( ).first() if existing is not None: if email_str and not existing.emails.filter(email=email_str).exists(): - add_email(existing, email_str, is_primary=not existing.emails.exists()) + add_email( + existing, email_str, is_primary=not existing.emails.exists() + ) return existing, False next_id = _get_next_negative_github_account_id() account = get_or_create_github_account( @@ -350,3 +359,13 @@ def get_or_create_discord_profile( profile.is_bot = is_bot profile.save() return profile, created + + +def get_or_create_wg21_paper_author_profile( + display_name: str, +) -> tuple[Any, bool]: + """Get or create a WG21PaperAuthorProfile by display_name.""" + display_name_val = (display_name or "").strip() + return WG21PaperAuthorProfile.objects.get_or_create( + display_name=display_name_val, + ) diff --git a/docs/Schema.md b/docs/Schema.md index 3f9fa87..308a662 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -611,6 +611,7 @@ erDiagram erDiagram Direction LR WG21PaperAuthorProfile ||--o{ WG21PaperAuthor : "author" + WG21Mailing ||--o{ WG21Paper : "has" WG21PaperAuthor }o--|| WG21Paper : "has" WG21PaperAuthor { @@ -620,12 +621,23 @@ erDiagram datetime created_at } + WG21Mailing { + int id PK + string mailing_date UK "IX" + string title + datetime created_at + datetime updated_at + } + WG21Paper { int id PK string paper_id UK "IX" string url string title "IX" - date publication_date "IX" + date document_date "IX" + int mailing_id FK "IX" + string subgroup "IX" + boolean is_downloaded "IX" datetime created_at datetime updated_at } @@ -633,6 +645,8 @@ erDiagram **Note:** **WG21PaperAuthorProfile** extends `BaseProfile` (section 1). `profile_id` in WG21PaperAuthor references this profile; each paper can have multiple authors. +**Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing. + **Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. --- @@ -746,7 +760,8 @@ erDiagram | **SlackMessage** | Message in a channel (ts, slack_user_id, message, thread_ts). | 6 | | **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | | **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | -| **WG21Paper** | WG21 paper (paper_id, url, title, publication_date). | 7 | +| **WG21Mailing** | WG21 mailing release (mailing_date, title). | 7 | +| **WG21Paper** | WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded). | 7 | | **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). | 7 | | **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | | **WebsiteVisitCount** | Per-date, per-country visit count. 
| 8 |
@@ -790,5 +805,6 @@ erDiagram
 | SlackChannel | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Contains / has many |
 | SlackUser | SlackMessage, SlackChannelMembership, SlackChannelMembershipChangeLog | Author / member / user |
 | SlackChannel | SlackUser | Creator (many-to-one) |
+| WG21Mailing | WG21Paper | Has many papers |
 | WG21PaperAuthorProfile | WG21PaperAuthor | Author (has many) |
 | WG21Paper | WG21PaperAuthor | Has many authors |
diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md
new file mode 100644
index 0000000..257e2bc
--- /dev/null
+++ b/docs/operations/WG21_Cloud_Run.md
@@ -0,0 +1,61 @@
+# WG21 Paper Conversion Cloud Run Job
+
+The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is therefore separated from the main Django project and runs as a Google Cloud Run Job.
+
+The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job via the Google Cloud Run API when new papers are downloaded.
+
+## 1. Set Up Google Cloud Storage
+
+Create a GCS bucket (e.g., `wg21-data-collector`).
+
+Ensure your Django app has the following environment variables configured:
+- `WG21_GCS_BUCKET`: The name of the GCS bucket.
+- `GCP_PROJECT_ID`: Your GCP project ID.
+- `WG21_CLOUD_RUN_JOB_NAME`: (Optional, defaults to `wg21-convert`) The name of the deployed Cloud Run job.
+- `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job.
+
+## 2. Build and Push the Docker Image
+
+Navigate to the Cloud Run job directory:
+
+```bash
+cd wg21_paper_tracker/cloud_run_job/
+```
+
+Build the Docker image. Replace `[PROJECT_ID]` with your GCP Project ID:
+
+```bash
+docker build -t gcr.io/[PROJECT_ID]/wg21-convert .
+```
+
+Push the image to Google Container Registry (or Artifact Registry):
+
+```bash
+docker push gcr.io/[PROJECT_ID]/wg21-convert
+```
+
+## 3. Create the Cloud Run Job
+
+Create the job in Google Cloud. We recommend allocating sufficient memory and CPU, since Docling and PDFPlumber are resource-intensive.
+
+```bash
+gcloud run jobs create wg21-convert \
+    --image gcr.io/[PROJECT_ID]/wg21-convert \
+    --memory 8Gi \
+    --cpu 4 \
+    --region us-central1 \
+    --set-env-vars WG21_GCS_BUCKET=wg21-data-collector,OPENROUTER_API_KEY=your_key
+```
+
+## 4. Service Account & IAM Permissions
+
+1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API.
+2. **GCS Access:** Both the Django application and the Cloud Run job require read/write access to the GCS bucket (`roles/storage.objectAdmin`).
+
+## 5. Flow Summary
+
+1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs.
+2. It checks the WG21 site for new mailings.
+3. If found, it downloads PDFs and uploads them directly to `gs://<WG21_GCS_BUCKET>/raw/wg21_papers/<mailing_date>/`.
+4. It calls the Cloud Run API to execute `wg21-convert`.
+5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs://<WG21_GCS_BUCKET>/converted/wg21_papers/<mailing_date>/`.
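+
+## 6. Manual Execution & Verification
+
+To smoke-test the deployment without waiting for the nightly tracker run, you can execute the job directly with `gcloud`. This is a minimal sketch assuming the job name and region used in the examples above (`wg21-convert` in `us-central1`); adjust the flags to match your deployment:
+
+```bash
+# Start one execution of the conversion job and wait until it completes
+gcloud run jobs execute wg21-convert --region us-central1 --wait
+
+# List recent executions to check their status
+gcloud run jobs executions list --job wg21-convert --region us-central1
+```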
diff --git a/parse_test.py b/parse_test.py new file mode 100644 index 0000000..19dd034 --- /dev/null +++ b/parse_test.py @@ -0,0 +1,8 @@ +import re +text = """- [2026-01 mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-01) +- [2026-02 pre-Croydon mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-02) +- [2026](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/) N5034-N????""" + +pattern = re.compile(r'\[([^\]]+)\]\([^#]+#mailing(\d{4}-\d{2})\)') +for m in pattern.finditer(text): + print(m.groups()) diff --git a/parse_year.py b/parse_year.py new file mode 100644 index 0000000..1f6277f --- /dev/null +++ b/parse_year.py @@ -0,0 +1,20 @@ +import requests +from bs4 import BeautifulSoup + +response = requests.get('https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/') +soup = BeautifulSoup(response.text, 'html.parser') + +anchor = soup.find('a', id='mailing2026-02') +if not anchor: + print("Anchor not found") +else: + table = anchor.find_next('table') + rows = table.find_all('tr') + print(f"Found {len(rows)} rows in table after anchor") + for row in rows[:3]: + cells = [c.text.strip() for c in row.find_all(['th', 'td'])] + print(cells) + # Also print links in first cell + if row.find('td'): + links = row.find_all('td')[0].find_all('a') + print("Links:", [l['href'] for l in links]) diff --git a/requirements.txt b/requirements.txt index 1b3f84d..289d486 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,9 @@ redis>=5.0 slack-bolt>=1.18 pytz>=2024.1 selenium>=4.35 + +# wg21_paper_tracker app +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +google-cloud-run>=0.10.1 +google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/__init__.py b/wg21_paper_tracker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py new file mode 100644 index 0000000..a22358d --- /dev/null +++ b/wg21_paper_tracker/admin.py @@ -0,0 +1,31 @@ +from django.contrib import admin +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@admin.register(WG21Mailing) +class WG21MailingAdmin(admin.ModelAdmin): + list_display = ("mailing_date", "title", "created_at", "updated_at") + search_fields = ("mailing_date", "title") + ordering = ("-mailing_date",) + + +class WG21PaperAuthorInline(admin.TabularInline): + model = WG21PaperAuthor + extra = 1 + raw_id_fields = ("profile",) + + +@admin.register(WG21Paper) +class WG21PaperAdmin(admin.ModelAdmin): + list_display = ("paper_id", "title", "document_date", "mailing", "subgroup", "is_downloaded") + search_fields = ("paper_id", "title", "url", "subgroup") + list_filter = ("is_downloaded", "subgroup", "mailing") + ordering = ("-document_date", "-paper_id") + inlines = [WG21PaperAuthorInline] + + +@admin.register(WG21PaperAuthor) +class WG21PaperAuthorAdmin(admin.ModelAdmin): + list_display = ("paper", "profile", "created_at") + search_fields = ("paper__paper_id", "profile__display_name") + raw_id_fields = ("paper", "profile") diff --git a/wg21_paper_tracker/apps.py b/wg21_paper_tracker/apps.py new file mode 100644 index 0000000..d6f09d9 --- /dev/null +++ b/wg21_paper_tracker/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class Wg21PaperTrackerConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "wg21_paper_tracker" + verbose_name = "WG21 Paper Tracker" diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile new file mode 
100644 index 0000000..21b51ef --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/Dockerfile @@ -0,0 +1,24 @@ +# Use an official Python runtime as a parent image +FROM python:3.11-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies required by converters (e.g. Poppler for PDF image extraction) +RUN apt-get update && apt-get install -y --no-install-recommends \ + poppler-utils \ + libgl1-mesa-glx \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application files +COPY . . + +# Run the main script +CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/__init__.py b/wg21_paper_tracker/cloud_run_job/converters/__init__.py new file mode 100644 index 0000000..515d30a --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/__init__.py @@ -0,0 +1,9 @@ +""" +PDF to Markdown converters module. +""" + +from .docling_converter import convert_with_docling +from .pdfplumber_converter import convert_with_pdfplumber +from .openai_converter import convert_with_openai + +__all__ = ["convert_with_docling", "convert_with_pdfplumber", "convert_with_openai"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py new file mode 100644 index 0000000..b113332 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -0,0 +1,62 @@ +""" +Docling-based PDF to Markdown converter. +""" + +from pathlib import Path +from typing import Optional +import logging +logger = logging.getLogger(__name__) + +try: + from docling.document_converter import DocumentConverter # type: ignore[import-untyped] + from docling.datamodel.base_models import InputFormat # type: ignore[import-untyped] + + DOCLING_AVAILABLE = True +except ImportError: + DocumentConverter = None # type: ignore[assignment,misc] + InputFormat = None # type: ignore[assignment,misc] + DOCLING_AVAILABLE = False + logger.warning("Docling not available. Install with: pip install docling") + + +def convert_with_docling(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using Docling. + + Args: + pdf_path: Path to the PDF file. + + Returns: + Markdown content as string, or None if conversion fails. 
+ """ + if not DOCLING_AVAILABLE or DocumentConverter is None: + logger.error("Docling is not available") + return None + + try: + logger.info(f"Attempting Docling conversion for: {pdf_path.name}") + + # Initialize converter + converter = DocumentConverter() + + # Convert PDF to document + result = converter.convert(pdf_path) + + # Extract markdown + markdown_content = result.document.export_to_markdown() + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"Docling conversion successful for: {pdf_path.name}") + logger.info(f"Extracted {len(markdown_content)} characters") + return markdown_content + else: + logger.warning( + f"Docling conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"Docling conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py new file mode 100644 index 0000000..211ade7 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -0,0 +1,260 @@ +""" +OpenAI/OpenRouter-based PDF to Markdown converter with OCR. +""" + +import base64 +from pathlib import Path +from typing import Optional +import requests +import logging +logger = logging.getLogger(__name__) + +# Base configuration fallback +import os +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" +OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") + +try: + from pdf2image import convert_from_path + from PIL import Image, ImageOps + import io + + PDF2IMAGE_AVAILABLE = True +except ImportError: + PDF2IMAGE_AVAILABLE = False + logger.warning( + "pdf2image/PIL not available. Install with: pip install pdf2image pillow" + ) + + +def pdf_to_images(pdf_path: Path) -> list[Image.Image]: + """ + Convert PDF pages to images. + + Note: pdf2image should automatically handle PDF rotation metadata, + but we also apply additional rotation correction in correct_image_rotation(). + + Args: + pdf_path: Path to the PDF file. + + Returns: + List of PIL Image objects. + """ + if not PDF2IMAGE_AVAILABLE: + logger.error("pdf2image is not available") + return [] + + try: + logger.info(f"Converting PDF to images: {pdf_path.name}") + # pdf2image should respect PDF rotation, but we'll also check EXIF data + images = convert_from_path(pdf_path, dpi=200) + logger.info(f"Converted {len(images)} pages to images") + return images + except Exception as e: + logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) + return [] + + +def correct_image_rotation(image: Image.Image) -> Image.Image: + """ + Correct image rotation using EXIF data and heuristics. + + Args: + image: PIL Image object. + + Returns: + Corrected PIL Image object. 
+ """ + try: + # First, try to correct using EXIF orientation data + # This handles images that have rotation metadata + corrected_image = ImageOps.exif_transpose(image) + + # If the image was rotated, log it + if corrected_image != image: + logger.debug("Image rotation corrected using EXIF data") + return corrected_image + + # If no EXIF data, check if image might be rotated + # For PDF pages, we can check if width > height suggests landscape + # But we'll keep the original orientation as PDFs can be in any orientation + # The OpenAI vision model can handle rotated text, but it's better to correct it + + return corrected_image + + except Exception as e: + logger.warning(f"Error correcting image rotation: {str(e)}") + return image + + +def image_to_base64(image: Image.Image) -> str: + """ + Convert PIL Image to base64 string. + Automatically corrects rotation before encoding. + + Args: + image: PIL Image object. + + Returns: + Base64 encoded string. + """ + # Correct rotation before encoding + corrected_image = correct_image_rotation(image) + + buffered = io.BytesIO() + corrected_image.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode() + return img_str + + +def convert_page_with_openai( + image_base64: str, page_num: int, total_pages: int +) -> Optional[str]: + """ + Convert a single page image to markdown using OpenAI/OpenRouter. + + Args: + image_base64: Base64 encoded image string. + page_num: Current page number. + total_pages: Total number of pages. + + Returns: + Markdown content for the page, or None if conversion fails. + """ + if not OPENROUTER_API_KEY: + logger.error("OpenRouter API key is not set") + return None + + try: + logger.info(f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter") + + url = f"{OPENROUTER_BASE_URL}/chat/completions" + headers = { + "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Content-Type": "application/json", + } + + payload = { + "model": OPENROUTER_MODEL, + "messages": [ + { + "role": "system", + "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. If the page appears rotated, read and convert the text in its correct orientation.", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_base64}" + }, + }, + ], + }, + ], + "max_tokens": 4000, + } + + response = requests.post(url, json=payload, headers=headers, timeout=120) + response.raise_for_status() + + result = response.json() + markdown_content = result["choices"][0]["message"]["content"] + + logger.info(f"Successfully converted page {page_num} with OpenAI/OpenRouter") + return markdown_content + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + +def convert_with_openai(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using OpenAI/OpenRouter with OCR. + Processes each page as an image. + + Args: + pdf_path: Path to the PDF file. 
+ + Returns: + Markdown content as string, or None if conversion fails. + """ + if not OPENROUTER_API_KEY: + logger.error("OpenRouter API key is not set in environment variables") + return None + + if not PDF2IMAGE_AVAILABLE: + logger.error("pdf2image is required for OpenAI conversion") + return None + + try: + logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") + + # Convert PDF to images + images = pdf_to_images(pdf_path) + if not images: + logger.error(f"Failed to convert PDF to images: {pdf_path.name}") + return None + + total_pages = len(images) + markdown_parts = [] + + # Process each page + for page_num, image in enumerate(images, 1): + try: + # Convert image to base64 + image_base64 = image_to_base64(image) + + # Convert page with OpenAI + page_markdown = convert_page_with_openai( + image_base64, page_num, total_pages + ) + + if page_markdown: + markdown_parts.append(page_markdown) + markdown_parts.append("\n\n") + else: + logger.warning(f"Failed to convert page {page_num} with OpenAI") + markdown_parts.append( + f"## Page {page_num}\n\n*[Conversion failed for this page]*\n\n" + ) + + except Exception as e: + logger.error( + f"Error processing page {page_num}: {str(e)}", exc_info=True + ) + markdown_parts.append( + f"## Page {page_num}\n\n*[Error processing this page: {str(e)}]*\n\n" + ) + continue + + markdown_content = "".join(markdown_parts) + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") + logger.info( + f"Extracted {len(markdown_content)} characters from {total_pages} pages" + ) + return markdown_content + else: + logger.warning( + f"OpenAI/OpenRouter conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for {pdf_path.name}: {str(e)}", + exc_info=True, + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py new file mode 100644 index 0000000..31073d1 --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -0,0 +1,92 @@ +""" +PDFPlumber-based PDF to Markdown converter. +""" + +from pathlib import Path +from typing import Optional +import logging +logger = logging.getLogger(__name__) + +try: + import pdfplumber + + PDFPLUMBER_AVAILABLE = True +except ImportError: + PDFPLUMBER_AVAILABLE = False + logger.warning("PDFPlumber not available. Install with: pip install pdfplumber") + + +def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: + """ + Convert PDF to Markdown using PDFPlumber. + + Args: + pdf_path: Path to the PDF file. + + Returns: + Markdown content as string, or None if conversion fails. 
+ """ + if not PDFPLUMBER_AVAILABLE: + logger.error("PDFPlumber is not available") + return None + + try: + logger.info(f"Attempting PDFPlumber conversion for: {pdf_path.name}") + + markdown_parts = [] + + with pdfplumber.open(pdf_path) as pdf: + total_pages = len(pdf.pages) + logger.info(f"Processing {total_pages} pages with PDFPlumber") + + for page_num, page in enumerate(pdf.pages, 1): + try: + # Extract text from page + text = page.extract_text() + + if text: + markdown_parts.append(text) + markdown_parts.append("\n\n") + + # Extract tables if any + tables = page.extract_tables() + if tables: + for table in tables: + if table: + markdown_parts.append("\n### Table\n\n") + # Convert table to markdown format + for row in table: + if row: + markdown_parts.append( + "| " + + " | ".join( + str(cell) if cell else "" + for cell in row + ) + + " |\n" + ) + markdown_parts.append("\n") + + except Exception as e: + logger.warning( + f"Error processing page {page_num} of {pdf_path.name}: {str(e)}" + ) + continue + + markdown_content = "".join(markdown_parts) + + if markdown_content and len(markdown_content.strip()) > 0: + logger.info(f"PDFPlumber conversion successful for: {pdf_path.name}") + logger.info(f"Extracted {len(markdown_content)} characters") + return markdown_content + else: + logger.warning( + f"PDFPlumber conversion returned empty content for: {pdf_path.name}" + ) + return None + + except Exception as e: + logger.error( + f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + ) + return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py new file mode 100644 index 0000000..952124b --- /dev/null +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -0,0 +1,100 @@ +import os +import logging +from pathlib import Path +import tempfile +from google.cloud import storage + +from converters.docling_converter import convert_with_docling +from converters.pdfplumber_converter import convert_with_pdfplumber +from converters.openai_converter import convert_with_openai + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +MIN_CONTENT_LENGTH = 50 + +def is_content_valid(content: str) -> bool: + if not content: + return False + content_stripped = content.strip() + if len(content_stripped) < MIN_CONTENT_LENGTH: + return False + error_patterns = [ + "traceback", "exception:", "error:", "failed to", "unable to convert", "conversion failed", "error processing" + ] + content_lower = content_stripped.lower() + first_part = content_lower[:1000] + for pattern in error_patterns: + if pattern in first_part: + if pattern.startswith("error:") or pattern.startswith("exception:"): + return False + idx = first_part.find(pattern) + if idx < 100: + return False + return True + +def convert_pdf_to_md(pdf_path: Path) -> str: + logger.info("Attempting Docling conversion...") + content = convert_with_docling(pdf_path) + if is_content_valid(content): + return content + + logger.info("Attempting PDFPlumber conversion...") + content = convert_with_pdfplumber(pdf_path) + if is_content_valid(content): + return content + + logger.info("Attempting OpenAI conversion...") + content = convert_with_openai(pdf_path) + if is_content_valid(content): + return content + + return "" + +def main(): + bucket_name = os.getenv("WG21_GCS_BUCKET") + if not bucket_name: + logger.error("WG21_GCS_BUCKET env var not set.") + return + + client = storage.Client() + bucket = 
client.bucket(bucket_name)
+
+    raw_prefix = "raw/wg21_papers/"
+    converted_prefix = "converted/wg21_papers/"
+
+    blobs = client.list_blobs(bucket, prefix=raw_prefix)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        for blob in blobs:
+            if not blob.name.lower().endswith(".pdf"):
+                continue
+
+            # e.g. raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf
+            relative_path = blob.name[len(raw_prefix):]
+            md_relative_path = relative_path.rsplit(".", 1)[0] + ".md"
+            md_blob_name = f"{converted_prefix}{md_relative_path}"
+
+            md_blob = bucket.blob(md_blob_name)
+            if md_blob.exists():
+                logger.info("Skipping %s, MD already exists.", blob.name)
+                continue
+
+            local_pdf_path = Path(tmpdir) / "temp.pdf"
+            logger.info("Downloading %s to process...", blob.name)
+            blob.download_to_filename(str(local_pdf_path))
+
+            logger.info("Converting %s...", blob.name)
+            md_content = convert_pdf_to_md(local_pdf_path)
+
+            if md_content:
+                md_blob.upload_from_string(md_content, content_type="text/markdown")
+                logger.info("Successfully converted and uploaded %s", md_blob_name)
+            else:
+                logger.error("Failed to convert %s", blob.name)
+
+            if local_pdf_path.exists():
+                local_pdf_path.unlink()
+
+if __name__ == "__main__":
+    main()
diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt
new file mode 100644
index 0000000..0a00731
--- /dev/null
+++ b/wg21_paper_tracker/cloud_run_job/requirements.txt
@@ -0,0 +1,6 @@
+docling>=1.0.0
+pdfplumber>=0.10.0
+pdf2image>=1.16.0
+Pillow>=10.0.0
+requests>=2.31.0
+google-cloud-storage>=2.14.0
diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py
new file mode 100644
index 0000000..e254223
--- /dev/null
+++ b/wg21_paper_tracker/fetcher.py
@@ -0,0 +1,156 @@
+"""
+Fetcher for WG21 Papers.
+Scrapes the WG21 papers index and specific mailing tables.
+"""
+
+import re
+import urllib.parse
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+from django.utils.dateparse import parse_date
+
+import logging
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers"
+
+
+def fetch_all_mailings() -> list[dict]:
+    """
+    Fetch the main index and extract all mailings.
+    Returns a list of dicts:
+    - mailing_date (e.g. '2025-02')
+    - title (e.g. '2025-02 pre-Hagenberg mailing')
+    - year (e.g. '2025')
+    List is in the order found on the page (usually newest first).
+    """
+    logger.info("Fetching WG21 main index: %s/", BASE_URL)
+    try:
+        response = requests.get(f"{BASE_URL}/", timeout=30)
+        response.raise_for_status()
+    except Exception as e:
+        logger.error("Failed to fetch WG21 index: %s", e)
+        return []
+
+    # The mailings are listed as anchor links on the index page.
+    # Typically: <a href="2025/#mailing2025-02">2025-02 pre-Hagenberg mailing</a>
+    # Parse them with BeautifulSoup.
+    soup = BeautifulSoup(response.text, "html.parser")
+    mailings = []
+
+    # We look for links pointing to year/#mailingYYYY-MM
+    pattern = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$")
+
+    for a in soup.find_all("a", href=True):
+        href = a["href"]
+        match = pattern.search(href)
+        if match:
+            year, mailing_date = match.groups()
+            title = a.text.strip()
+            mailings.append({
+                "mailing_date": mailing_date,
+                "title": title,
+                "year": year
+            })
+
+    return mailings
+
+
+def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]:
+    """
+    Fetch the papers for a specific mailing from the year page.
+    Returns a list of paper dicts.
+ """ + url = f"{BASE_URL}/{year}/" + logger.info("Fetching mailing %s from %s", mailing_date, url) + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + except Exception as e: + logger.error("Failed to fetch year page %s: %s", year, e) + return [] + + soup = BeautifulSoup(response.text, "html.parser") + anchor_id = f"mailing{mailing_date}" + anchor = soup.find(id=anchor_id) + if not anchor: + logger.warning("Anchor %s not found on %s", anchor_id, url) + return [] + + table = anchor.find_next("table") + if not table: + logger.warning("No table found after anchor %s", anchor_id) + return [] + + paper_urls = [] + paper_pattern = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) + + for row in table.find_all("tr"): + cells = row.find_all(["td", "th"]) + if not cells or any(cell.get("colspan") for cell in cells): + continue + + # Usually: Number, Title, Author, Date, Subgroup + if len(cells) >= 1: + first_cell = cells[0] + for link in first_cell.find_all("a", href=True): + href = link.get("href", "") + match = paper_pattern.search(href) + if match: + if href.startswith("../"): + paper_url = urllib.parse.urljoin(url, href) + elif href.startswith("/"): + paper_url = urllib.parse.urljoin(BASE_URL, href) + elif not href.startswith("http"): + paper_url = urllib.parse.urljoin(url, href) + else: + paper_url = href + + paper_id = match.group(1).lower() + file_ext = match.group(2).lower() + filename = match.group(0).lower() + + title = "" + if len(cells) > 1: + title = cells[1].text.strip() + + authors = [] + if len(cells) > 2: + authors_raw = cells[2].text.strip() + # Split by comma or 'and' if multiple + if authors_raw: + authors = [a.strip() for a in re.split(r",| and ", authors_raw) if a.strip()] + + document_date = None + if len(cells) > 3: + date_str = cells[3].text.strip() + if date_str: + document_date = date_str # Will be parsed/saved in pipeline + + subgroup = "" + if len(cells) > 4: + subgroup = cells[4].text.strip() + + paper_urls.append({ + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + }) + break # Only take the first paper link in the cell + + # Remove exact duplicates (same filename) + seen = set() + unique_papers = [] + for p in paper_urls: + if p["filename"] not in seen: + seen.add(p["filename"]) + unique_papers.append(p) + + return unique_papers diff --git a/wg21_paper_tracker/management/__init__.py b/wg21_paper_tracker/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/management/commands/__init__.py b/wg21_paper_tracker/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py new file mode 100644 index 0000000..9e06f09 --- /dev/null +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -0,0 +1,56 @@ +""" +Management command for WG21 Paper Tracker. +Runs the pipeline to fetch new mailings, download papers, upload to GCS, and update DB. +If new papers were found and uploaded, it triggers the Google Cloud Run conversion job. 
+""" + +import logging +import os +from django.core.management.base import BaseCommand +from django.conf import settings + +from wg21_paper_tracker.pipeline import run_tracker_pipeline + +logger = logging.getLogger(__name__) + +def trigger_cloud_run_job(project_id: str, location: str, job_name: str): + from google.cloud import run_v2 + client = run_v2.JobsClient() + name = client.job_path(project_id, location, job_name) + request = run_v2.RunJobRequest(name=name) + logger.info("Triggering Cloud Run job %s...", name) + operation = client.run_job(request=request) + logger.info("Cloud Run job triggered. Operation: %s", operation.operation.name) + return operation + +class Command(BaseCommand): + help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers." + + def handle(self, *args, **options): + logger.info("Starting WG21 Paper Tracker...") + + try: + total_new_papers = run_tracker_pipeline() + self.stdout.write(self.style.SUCCESS(f"Downloaded and uploaded {total_new_papers} new papers.")) + + if total_new_papers > 0: + project_id = settings.GCP_PROJECT_ID + location = settings.GCP_LOCATION + job_name = settings.WG21_CLOUD_RUN_JOB_NAME + + if project_id and job_name: + try: + trigger_cloud_run_job(project_id, location, job_name) + self.stdout.write(self.style.SUCCESS(f"Successfully triggered Cloud Run job {job_name}.")) + except Exception as e: + logger.error("Failed to trigger Cloud Run job: %s", e) + self.stderr.write(self.style.ERROR(f"Error triggering Cloud Run job: {e}")) + else: + logger.warning("GCP_PROJECT_ID not configured. Skipping Cloud Run trigger.") + self.stdout.write(self.style.WARNING("Skipping Cloud Run trigger (missing GCP config).")) + else: + self.stdout.write("No new papers found. Skipping Cloud Run job.") + + except Exception as e: + logger.exception("WG21 Paper Tracker failed: %s", e) + raise diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py new file mode 100644 index 0000000..01e7e58 --- /dev/null +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -0,0 +1,69 @@ +# Generated by Django 4.2.28 on 2026-03-09 15:35 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('cppa_user_tracker', '0005_alter_slackuser_slack_user_id'), + ] + + operations = [ + migrations.CreateModel( + name='WG21Mailing', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('mailing_date', models.CharField(db_index=True, max_length=7, unique=True)), + ('title', models.CharField(max_length=255)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('updated_at', models.DateTimeField(auto_now=True)), + ], + options={ + 'verbose_name': 'WG21 Mailing', + 'verbose_name_plural': 'WG21 Mailings', + 'db_table': 'wg21_paper_tracker_wg21mailing', + 'ordering': ['-mailing_date'], + }, + ), + migrations.CreateModel( + name='WG21Paper', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('paper_id', models.CharField(db_index=True, max_length=255, unique=True)), + ('url', models.URLField(max_length=1024)), + ('title', models.CharField(db_index=True, max_length=1024)), + ('document_date', models.DateField(blank=True, db_index=True, null=True)), + ('subgroup', models.CharField(blank=True, db_index=True, max_length=255)), + ('is_downloaded', 
models.BooleanField(db_index=True, default=False)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('updated_at', models.DateTimeField(auto_now=True)), + ('mailing', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='wg21_paper_tracker.wg21mailing')), + ], + options={ + 'verbose_name': 'WG21 Paper', + 'verbose_name_plural': 'WG21 Papers', + 'db_table': 'wg21_paper_tracker_wg21paper', + 'ordering': ['-document_date', '-paper_id'], + }, + ), + migrations.CreateModel( + name='WG21PaperAuthor', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('paper', models.ForeignKey(db_column='paper_id', on_delete=django.db.models.deletion.CASCADE, related_name='authors', to='wg21_paper_tracker.wg21paper')), + ('profile', models.ForeignKey(db_column='profile_id', on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='cppa_user_tracker.wg21paperauthorprofile')), + ], + options={ + 'verbose_name': 'WG21 Paper Author', + 'verbose_name_plural': 'WG21 Paper Authors', + 'db_table': 'wg21_paper_tracker_wg21paperauthor', + 'ordering': ['id'], + 'unique_together': {('paper', 'profile')}, + }, + ), + ] diff --git a/wg21_paper_tracker/migrations/__init__.py b/wg21_paper_tracker/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py new file mode 100644 index 0000000..9ae2d27 --- /dev/null +++ b/wg21_paper_tracker/models.py @@ -0,0 +1,76 @@ +""" +Models per docs/Schema.md section 7: WG21 Papers Tracker. +References cppa_user_tracker.WG21PaperAuthorProfile (section 1) as author. +""" + +from django.db import models + + +class WG21Mailing(models.Model): + """WG21 mailing release (mailing_date, title).""" + + mailing_date = models.CharField(max_length=7, unique=True, db_index=True) + title = models.CharField(max_length=255) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-mailing_date"] + verbose_name = "WG21 Mailing" + verbose_name_plural = "WG21 Mailings" + + def __str__(self): + return f"{self.mailing_date} ({self.title})" + + +class WG21Paper(models.Model): + """WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded).""" + + paper_id = models.CharField(max_length=255, unique=True, db_index=True) + url = models.URLField(max_length=1024) + title = models.CharField(max_length=1024, db_index=True) + document_date = models.DateField(db_index=True, null=True, blank=True) + mailing = models.ForeignKey( + WG21Mailing, + on_delete=models.CASCADE, + related_name="papers", + ) + subgroup = models.CharField(max_length=255, blank=True, db_index=True) + is_downloaded = models.BooleanField(default=False, db_index=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-document_date", "-paper_id"] + verbose_name = "WG21 Paper" + verbose_name_plural = "WG21 Papers" + + def __str__(self): + return f"{self.paper_id}: {self.title[:60]}" + + +class WG21PaperAuthor(models.Model): + """Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile).""" + + paper = models.ForeignKey( + WG21Paper, + on_delete=models.CASCADE, + related_name="authors", + db_column="paper_id", + ) + profile = models.ForeignKey( + "cppa_user_tracker.WG21PaperAuthorProfile", + 
on_delete=models.CASCADE, + related_name="papers", + db_column="profile_id", + ) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + unique_together = (("paper", "profile"),) + ordering = ["id"] + verbose_name = "WG21 Paper Author" + verbose_name_plural = "WG21 Paper Authors" + + def __str__(self): + return f"{self.paper.paper_id} - {self.profile.display_name}" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py new file mode 100644 index 0000000..3c4146c --- /dev/null +++ b/wg21_paper_tracker/pipeline.py @@ -0,0 +1,174 @@ +""" +Pipeline for WG21 Paper Tracker. +Coordinates scraping, downloading, uploading to GCS, and updating the database. +""" + +import os +import requests +import logging +from pathlib import Path +from typing import Optional + +from django.conf import settings +from google.cloud import storage + +from wg21_paper_tracker.fetcher import fetch_all_mailings, fetch_papers_for_mailing +from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.services import get_or_create_mailing, get_or_create_paper +from wg21_paper_tracker.workspace import get_raw_dir + +logger = logging.getLogger(__name__) + +def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: str) -> bool: + """Uploads a file to the bucket.""" + try: + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(destination_blob_name) + + blob.upload_from_filename(str(source_path)) + logger.info("Uploaded %s to gs://%s/%s", source_path.name, bucket_name, destination_blob_name) + return True + except Exception as e: + logger.error("Failed to upload to GCS: %s", e) + return False + +def _download_file(url: str, filepath: Path) -> bool: + """Download file from URL to filepath.""" + try: + logger.info("Downloading %s to %s", url, filepath) + response = requests.get(url, timeout=60, stream=True) + response.raise_for_status() + + # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. + content_type = response.headers.get("content-type", "") + if "text" in content_type: + with open(filepath, "w", encoding="utf-8") as f: + f.write(response.content.decode(response.apparent_encoding or "utf-8", errors="replace")) + else: + with open(filepath, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + return True + except Exception as e: + logger.error("Failed to download %s: %s", url, e) + return False + +def run_tracker_pipeline() -> int: + """ + Run the WG21 tracker pipeline. + Returns the number of new papers downloaded and uploaded. + """ + bucket_name = settings.WG21_GCS_BUCKET + if not bucket_name: + logger.warning("WG21_GCS_BUCKET not set. Will download but not upload to GCS.") + + # 1. Get latest mailing from DB + latest_mailing = WG21Mailing.objects.order_by("-mailing_date").first() + latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01" + + # 2. 
Fetch all mailings
+    all_mailings = fetch_all_mailings()
+    if not all_mailings:
+        logger.warning("No mailings found on WG21 site.")
+        return 0
+
+    # Filter newer mailings
+    new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date]
+    # Also check the latest one again just in case new papers were added
+    if latest_mailing and latest_mailing.mailing_date not in [m["mailing_date"] for m in new_mailings]:
+        # We re-check the most recent mailing from the DB to catch late additions
+        # Find the matching dict from all_mailings
+        current_m = next((m for m in all_mailings if m["mailing_date"] == latest_mailing.mailing_date), None)
+        if current_m:
+            new_mailings.append(current_m)
+
+    # Sort chronologically (oldest to newest)
+    new_mailings.sort(key=lambda x: x["mailing_date"])
+
+    total_new_papers = 0
+
+    for m_info in new_mailings:
+        mailing_date = m_info["mailing_date"]
+        title = m_info["title"]
+        year = m_info["year"]
+
+        # Create/get mailing in DB
+        mailing_obj, _ = get_or_create_mailing(mailing_date, title)
+
+        # Fetch papers for this mailing
+        papers = fetch_papers_for_mailing(year, mailing_date)
+        if not papers:
+            continue
+
+        # Group papers by ID to prioritize PDF over HTML
+        papers_by_id = {}
+        for p in papers:
+            pid = p["paper_id"]
+            if pid not in papers_by_id:
+                papers_by_id[pid] = []
+            papers_by_id[pid].append(p)
+
+        def format_priority(ext: str) -> int:
+            priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4}
+            return priorities.get(ext.lower(), 100)
+
+        raw_dir = get_raw_dir(mailing_date)
+
+        for pid, p_list in papers_by_id.items():
+            # Check DB if this paper_id is already fully downloaded
+            existing_paper = WG21Paper.objects.filter(paper_id=pid).first()
+            if existing_paper and existing_paper.is_downloaded:
+                continue
+
+            # Pick the best format
+            p_list.sort(key=lambda x: format_priority(x["type"]))
+            best_paper = p_list[0]
+
+            filename = best_paper["filename"]
+            local_path = raw_dir / filename
+            url = best_paper["url"]
+
+            # Download
+            if _download_file(url, local_path):
+                uploaded = False
+                if bucket_name:
+                    gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}"
+                    uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path)
+                else:
+                    # If no GCS, simulate success so DB is updated
+                    uploaded = True
+
+                # Persist DB
+                doc_date_str = best_paper["document_date"]
+                # Parse date if available
+                from django.utils.dateparse import parse_date
+                doc_date = None
+                if doc_date_str:
+                    try:
+                        doc_date = parse_date(doc_date_str)
+                    except Exception:
+                        pass
+
+                paper_obj, created = get_or_create_paper(
+                    paper_id=pid,
+                    url=url,
+                    title=best_paper["title"],
+                    document_date=doc_date,
+                    mailing=mailing_obj,
+                    subgroup=best_paper["subgroup"],
+                    author_names=best_paper["authors"],
+                )
+
+                if uploaded:
+                    paper_obj.is_downloaded = True
+                    paper_obj.save(update_fields=["is_downloaded"])
+                    total_new_papers += 1
+
+                # Clean up local file to save space
+                try:
+                    local_path.unlink()
+                except Exception as e:
+                    logger.warning("Could not delete temp file %s: %s", local_path, e)
+
+    return total_new_papers
diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py
new file mode 100644
index 0000000..3679d3f
--- /dev/null
+++ b/wg21_paper_tracker/services.py
@@ -0,0 +1,77 @@
+"""
+Database logic for WG21 Paper Tracker.
+""" + +from typing import Optional + +from django.db import transaction + +from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile +from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor + + +@transaction.atomic +def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: + mailing, created = WG21Mailing.objects.get_or_create( + mailing_date=mailing_date, + defaults={"title": title} + ) + if not created and mailing.title != title: + mailing.title = title + mailing.save(update_fields=["title", "updated_at"]) + return mailing, created + + +@transaction.atomic +def get_or_create_paper( + paper_id: str, + url: str, + title: str, + document_date: Optional[str], + mailing: WG21Mailing, + subgroup: str = "", + author_names: Optional[list[str]] = None, +) -> tuple[WG21Paper, bool]: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + } + ) + if not created: + updated = False + if paper.url != url: + paper.url = url + updated = True + if paper.title != title: + paper.title = title + updated = True + if paper.document_date != document_date: + paper.document_date = document_date + updated = True + if paper.mailing_id != mailing.id: + paper.mailing = mailing + updated = True + if paper.subgroup != subgroup: + paper.subgroup = subgroup + updated = True + if updated: + paper.save() + + if author_names: + for name in author_names: + profile, _ = get_or_create_wg21_paper_author_profile(name) + WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + ) + + return paper, created + + +def mark_paper_downloaded(paper_id: str): + WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True) diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py new file mode 100644 index 0000000..19c0d1b --- /dev/null +++ b/wg21_paper_tracker/workspace.py @@ -0,0 +1,23 @@ +""" +Workspace paths for wg21_paper_tracker. +Temporary file storage during download before uploading to GCS. 
+""" + +from pathlib import Path + +from config.workspace import get_workspace_path + +_APP_SLUG = "wg21_paper_tracker" +_RAW_APP_SLUG = f"raw/{_APP_SLUG}" + + +def get_workspace_root() -> Path: + return get_workspace_path(_APP_SLUG) + + +def get_raw_dir(mailing_date: str) -> Path: + """Return workspace/raw/wg21_paper_tracker//; creates if missing.""" + raw_root = get_workspace_path(_RAW_APP_SLUG) + path = raw_root / mailing_date + path.mkdir(parents=True, exist_ok=True) + return path diff --git a/workflow/management/commands/run_all_collectors.py b/workflow/management/commands/run_all_collectors.py index 25b262b..c2c1fe4 100644 --- a/workflow/management/commands/run_all_collectors.py +++ b/workflow/management/commands/run_all_collectors.py @@ -20,6 +20,7 @@ "run_boost_usage_tracker", "run_boost_mailing_list_tracker", "run_discord_exporter", + "run_wg21_paper_tracker", ] From 9892a4574adabad90a532853cd7a2bbe08ba0bea Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 19:18:09 -0700 Subject: [PATCH 02/20] wg21_paper_tracker: features, tests, and cleanup #24 --- config/settings.py | 4 +- cppa_user_tracker/services.py | 12 +- parse_test.py | 8 - parse_year.py | 20 -- wg21_paper_tracker/admin.py | 12 +- .../converters/docling_converter.py | 1 + .../converters/openai_converter.py | 5 +- .../converters/pdfplumber_converter.py | 1 + wg21_paper_tracker/cloud_run_job/main.py | 26 +- wg21_paper_tracker/fetcher.py | 49 ++-- .../commands/import_wg21_metadata_from_csv.py | 249 ++++++++++++++++++ .../commands/run_wg21_paper_tracker.py | 53 +++- wg21_paper_tracker/migrations/0001_initial.py | 140 +++++++--- wg21_paper_tracker/models.py | 8 +- wg21_paper_tracker/pipeline.py | 168 +++++++++--- wg21_paper_tracker/services.py | 17 +- wg21_paper_tracker/tests/__init__.py | 1 + wg21_paper_tracker/tests/test_fetcher.py | 179 +++++++++++++ wg21_paper_tracker/tests/test_models.py | 76 ++++++ wg21_paper_tracker/tests/test_pipeline.py | 149 +++++++++++ wg21_paper_tracker/tests/test_services.py | 223 ++++++++++++++++ wg21_paper_tracker/tests/test_workspace.py | 73 +++++ 22 files changed, 1310 insertions(+), 164 deletions(-) delete mode 100644 parse_test.py delete mode 100644 parse_year.py create mode 100644 wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py create mode 100644 wg21_paper_tracker/tests/__init__.py create mode 100644 wg21_paper_tracker/tests/test_fetcher.py create mode 100644 wg21_paper_tracker/tests/test_models.py create mode 100644 wg21_paper_tracker/tests/test_pipeline.py create mode 100644 wg21_paper_tracker/tests/test_services.py create mode 100644 wg21_paper_tracker/tests/test_workspace.py diff --git a/config/settings.py b/config/settings.py index ae4a50f..925ebe5 100644 --- a/config/settings.py +++ b/config/settings.py @@ -220,7 +220,9 @@ WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "").strip() +WG21_CLOUD_RUN_JOB_NAME = ( + env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "" +).strip() # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 35503f4..35b4e31 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ 
-50,9 +50,7 @@ def get_or_create_identity( """Get or create an Identity by display_name. If exists, updates description from defaults.""" lookup = {"display_name": display_name} defaults = defaults or {"description": description} - identity, created = Identity.objects.get_or_create( - defaults=defaults, **lookup - ) + identity, created = Identity.objects.get_or_create(defaults=defaults, **lookup) if ( not created and "description" in defaults @@ -265,9 +263,7 @@ def get_or_create_slack_user( raise ValueError("Slack user ID ('id') is required") profile = user_data.get("profile") or {} username = (user_data.get("name") or "").strip() - display_name = ( - user_data.get("real_name") or user_data.get("name") or "" - ).strip() + display_name = (user_data.get("real_name") or user_data.get("name") or "").strip() avatar_url = (profile.get("image_72") or "").strip() user, created = SlackUser.objects.get_or_create( slack_user_id=user_id, @@ -310,9 +306,7 @@ def get_or_create_unknown_github_account( ).first() if existing is not None: if email_str and not existing.emails.filter(email=email_str).exists(): - add_email( - existing, email_str, is_primary=not existing.emails.exists() - ) + add_email(existing, email_str, is_primary=not existing.emails.exists()) return existing, False next_id = _get_next_negative_github_account_id() account = get_or_create_github_account( diff --git a/parse_test.py b/parse_test.py deleted file mode 100644 index 19dd034..0000000 --- a/parse_test.py +++ /dev/null @@ -1,8 +0,0 @@ -import re -text = """- [2026-01 mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-01) -- [2026-02 pre-Croydon mailing](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/#mailing2026-02) -- [2026](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/) N5034-N????""" - -pattern = re.compile(r'\[([^\]]+)\]\([^#]+#mailing(\d{4}-\d{2})\)') -for m in pattern.finditer(text): - print(m.groups()) diff --git a/parse_year.py b/parse_year.py deleted file mode 100644 index 1f6277f..0000000 --- a/parse_year.py +++ /dev/null @@ -1,20 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -response = requests.get('https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2026/') -soup = BeautifulSoup(response.text, 'html.parser') - -anchor = soup.find('a', id='mailing2026-02') -if not anchor: - print("Anchor not found") -else: - table = anchor.find_next('table') - rows = table.find_all('tr') - print(f"Found {len(rows)} rows in table after anchor") - for row in rows[:3]: - cells = [c.text.strip() for c in row.find_all(['th', 'td'])] - print(cells) - # Also print links in first cell - if row.find('td'): - links = row.find_all('td')[0].find_all('a') - print("Links:", [l['href'] for l in links]) diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py index a22358d..86784ae 100644 --- a/wg21_paper_tracker/admin.py +++ b/wg21_paper_tracker/admin.py @@ -17,9 +17,17 @@ class WG21PaperAuthorInline(admin.TabularInline): @admin.register(WG21Paper) class WG21PaperAdmin(admin.ModelAdmin): - list_display = ("paper_id", "title", "document_date", "mailing", "subgroup", "is_downloaded") + list_display = ( + "paper_id", + "year", + "title", + "document_date", + "mailing", + "subgroup", + "is_downloaded", + ) search_fields = ("paper_id", "title", "url", "subgroup") - list_filter = ("is_downloaded", "subgroup", "mailing") + list_filter = ("is_downloaded", "subgroup", "mailing", "year") ordering = ("-document_date", "-paper_id") inlines = [WG21PaperAuthorInline] diff --git 
a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py index b113332..b9d6067 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional import logging + logger = logging.getLogger(__name__) try: diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 211ade7..cd168aa 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -2,15 +2,18 @@ OpenAI/OpenRouter-based PDF to Markdown converter with OCR. """ +import os import base64 from pathlib import Path from typing import Optional import requests import logging + logger = logging.getLogger(__name__) # Base configuration fallback -import os + + OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 31073d1..58a1465 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Optional import logging + logger = logging.getLogger(__name__) try: diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index 952124b..cf704ae 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -8,11 +8,14 @@ from converters.pdfplumber_converter import convert_with_pdfplumber from converters.openai_converter import convert_with_openai -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) MIN_CONTENT_LENGTH = 50 + def is_content_valid(content: str) -> bool: if not content: return False @@ -20,7 +23,13 @@ def is_content_valid(content: str) -> bool: if len(content_stripped) < MIN_CONTENT_LENGTH: return False error_patterns = [ - "traceback", "exception:", "error:", "failed to", "unable to convert", "conversion failed", "error processing" + "traceback", + "exception:", + "error:", + "failed to", + "unable to convert", + "conversion failed", + "error processing", ] content_lower = content_stripped.lower() first_part = content_lower[:1000] @@ -33,6 +42,7 @@ def is_content_valid(content: str) -> bool: return False return True + def convert_pdf_to_md(pdf_path: Path) -> str: logger.info("Attempting Docling conversion...") content = convert_with_docling(pdf_path) @@ -48,9 +58,10 @@ def convert_pdf_to_md(pdf_path: Path) -> str: content = convert_with_openai(pdf_path) if is_content_valid(content): return content - + return "" + def main(): bucket_name = os.getenv("WG21_GCS_BUCKET") if not bucket_name: @@ -64,14 +75,14 @@ def main(): converted_prefix = "converted/wg21_papers/" blobs = client.list_blobs(bucket, prefix=raw_prefix) - + with tempfile.TemporaryDirectory() as tmpdir: for blob in blobs: if not blob.name.lower().endswith(".pdf"): continue - + # e.g. 
raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix):] + relative_path = blob.name[len(raw_prefix) :] md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" md_blob_name = f"{converted_prefix}{md_relative_path}" @@ -92,9 +103,10 @@ def main(): logger.info("Successfully converted and uploaded %s", md_blob_name) else: logger.error("Failed to convert %s", blob.name) - + if local_pdf_path.exists(): local_pdf_path.unlink() + if __name__ == "__main__": main() diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index e254223..e733e83 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -5,13 +5,12 @@ import re import urllib.parse -from typing import Optional import requests from bs4 import BeautifulSoup -from django.utils.dateparse import parse_date import logging + logger = logging.getLogger(__name__) BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" @@ -39,22 +38,20 @@ def fetch_all_mailings() -> list[dict]: # Let's parse with BeautifulSoup soup = BeautifulSoup(response.text, "html.parser") mailings = [] - + # We look for links pointing to year/#mailingYYYY-MM pattern = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$") - + for a in soup.find_all("a", href=True): href = a["href"] match = pattern.search(href) if match: year, mailing_date = match.groups() title = a.text.strip() - mailings.append({ - "mailing_date": mailing_date, - "title": title, - "year": year - }) - + mailings.append( + {"mailing_date": mailing_date, "title": title, "year": year} + ) + return mailings @@ -74,7 +71,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: soup = BeautifulSoup(response.text, "html.parser") anchor_id = f"mailing{mailing_date}" - anchor = soup.find(id=anchor_id) + anchor = soup.find(id=anchor_id) or soup.find(attrs={"name": anchor_id}) if not anchor: logger.warning("Anchor %s not found on %s", anchor_id, url) return [] @@ -115,13 +112,17 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: title = "" if len(cells) > 1: title = cells[1].text.strip() - + authors = [] if len(cells) > 2: authors_raw = cells[2].text.strip() # Split by comma or 'and' if multiple if authors_raw: - authors = [a.strip() for a in re.split(r",| and ", authors_raw) if a.strip()] + authors = [ + a.strip() + for a in re.split(r",| and ", authors_raw) + if a.strip() + ] document_date = None if len(cells) > 3: @@ -133,16 +134,18 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: if len(cells) > 4: subgroup = cells[4].text.strip() - paper_urls.append({ - "url": paper_url, - "filename": filename, - "type": file_ext, - "paper_id": paper_id, - "title": title, - "authors": authors, - "document_date": document_date, - "subgroup": subgroup, - }) + paper_urls.append( + { + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + } + ) break # Only take the first paper link in the cell # Remove exact duplicates (same filename) diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py new file mode 100644 index 0000000..5d4a398 --- /dev/null +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -0,0 +1,249 @@ +""" +Management command: import_wg21_metadata_from_csv + +Reads 
workspace/wg21_paper_tracker/metadata.csv (or a given path) and fills
+WG21Mailing, WG21Paper, and WG21PaperAuthor using get_or_create_mailing and
+get_or_create_paper. Handles missing mailing_date via a placeholder mailing
+(unknown / Unknown).
+"""
+
+import csv
+import logging
+import re
+from pathlib import Path
+
+from django.core.management.base import BaseCommand
+from django.db import IntegrityError
+from django.utils.dateparse import parse_date
+
+from wg21_paper_tracker.models import WG21Paper, WG21PaperAuthor
+from wg21_paper_tracker.services import (
+    get_or_create_mailing,
+    get_or_create_paper,
+)
+from wg21_paper_tracker.workspace import get_workspace_root
+
+logger = logging.getLogger(__name__)
+
+MAILING_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}$")
+TITLE_MAX_LENGTH = 1024
+PLACEHOLDER_MAILING_DATE = "unknown"
+PLACEHOLDER_MAILING_TITLE = "Unknown"
+
+
+def _norm(s: str) -> str:
+    """Return the string stripped of leading/trailing whitespace, or empty string if None."""
+    return (s or "").strip()
+
+
+def _normalize_title(raw: str) -> str:
+    """Replace internal newlines with space and truncate to model max_length."""
+    if not raw:
+        return ""
+    one_line = " ".join(raw.split())
+    return (
+        one_line[:TITLE_MAX_LENGTH]
+        if len(one_line) > TITLE_MAX_LENGTH
+        else one_line
+    )
+
+
+def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]:
+    """
+    Return (mailing_date, title) for this row.
+    If CSV mailing_date is non-empty and YYYY-MM, use it with synthetic title.
+    Otherwise use placeholder mailing_date="unknown", title="Unknown".
+    """
+    cleaned = _norm(csv_mailing_date)
+    if cleaned and MAILING_DATE_PATTERN.match(cleaned):
+        return cleaned, f"{cleaned} (from metadata)"
+    return PLACEHOLDER_MAILING_DATE, PLACEHOLDER_MAILING_TITLE
+
+
+def _parse_document_date(date_str: str):
+    """Return date or None from CSV date column (e.g. YYYY-MM-DD). Invalid values return None."""
+    cleaned = _norm(date_str)
+    if not cleaned:
+        return None
+    try:
+        return parse_date(cleaned)
+    except (ValueError, TypeError):
+        return None
+
+
+def _author_names_from_csv(author_str: str) -> list[str]:
+    """Split author column by comma, strip each, drop empty."""
+    cleaned = _norm(author_str)
+    if not cleaned:
+        return []
+    return [a.strip() for a in cleaned.split(",") if a.strip()]
+
+
+def _read_csv_rows(csv_path: Path):
+    """Yield dicts for each row with normalized keys and values."""
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            out = {}
+            for k, v in row.items():
+                if k is None:
+                    continue
+                key = k.strip().lower()
+                out[key] = _norm(v) if v is not None else ""
+            # Normalize title (multi-line -> single line, truncate)
+            if "title" in out:
+                out["title"] = _normalize_title(out["title"])
+            yield out
+
+
+class Command(BaseCommand):
+    help = (
+        "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). "
+        "CSV columns: filename, paper_id, url, title, author, date, mailing_date, subgroup. "
+        "When mailing_date is empty, papers are linked to a single 'unknown' mailing."
+ ) + + def add_arguments(self, parser): + parser.add_argument( + "--csv-file", + type=Path, + default=None, + help="Path to metadata CSV (default: workspace/wg21_paper_tracker/metadata.csv)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Only read CSV and report what would be done; do not write to DB.", + ) + + def handle(self, *args, **options): + csv_path = options.get("csv_file") or ( + get_workspace_root() / "metadata.csv" + ) + dry_run = options["dry_run"] + + if not csv_path.exists(): + logger.error("File not found: %s", csv_path) + return + + if dry_run: + logger.info("Dry run: no DB writes.") + + stats = { + "rows": 0, + "skipped": 0, + "mailings_created": 0, + "papers_created": 0, + "papers_updated": 0, + } + + for row in _read_csv_rows(csv_path): + stats["rows"] += 1 + paper_id = (row.get("paper_id", "") or "").strip().lower() + url = row.get("url", "") + document_date = row.get("date", "") + + if not paper_id or not url: + stats["skipped"] += 1 + if stats["skipped"] <= 5: + logger.debug( + "Skipping row: missing paper_id or url: %s", + row.get("paper_id", "") or row.get("url", "")[:50], + ) + continue + + mailing_date, mailing_title = _resolve_mailing_date( + row.get("mailing_date", "") + ) + year_str = ( + mailing_date[:4] + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date) + else (document_date[:4] if document_date else None) + ) + year = int(year_str) if year_str and year_str.isdigit() else None + try: + document_date = _parse_document_date(row.get("date", "")) + title = row.get("title", "") or paper_id + subgroup = row.get("subgroup", "") + author_names = _author_names_from_csv(row.get("author", "")) + except Exception as e: + stats["skipped"] += 1 + logger.error( + "Error parsing document date for paper_id=%s: %s", + paper_id, + e, + ) + continue + + if dry_run: + logger.info( + "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", + paper_id, + mailing_date, + document_date, + len(author_names), + ) + continue + + try: + mailing, mailing_created = get_or_create_mailing( + mailing_date, mailing_title + ) + if mailing_created: + stats["mailings_created"] += 1 + + paper, paper_created = get_or_create_paper( + paper_id=paper_id, + url=url, + title=title, + document_date=document_date, + mailing=mailing, + subgroup=subgroup, + author_names=author_names if author_names else None, + year=year, + ) + if paper_created: + stats["papers_created"] += 1 + else: + stats["papers_updated"] += 1 + except IntegrityError as e: + # Duplicate (paper_id) or (paper_id, year): fetch existing and update + stats["papers_updated"] += 1 + try: + paper = WG21Paper.objects.filter(paper_id=paper_id).first() + if paper is None: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", paper_id, e) + else: + paper.url = url + paper.title = title + paper.document_date = document_date + paper.mailing = mailing + paper.subgroup = subgroup + if year is not None: + paper.year = year + paper.save() + if author_names: + from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + for name in author_names: + profile, _ = get_or_create_wg21_paper_author_profile(name) + WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + ) + except Exception as inner: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s (after IntegrityError): %s", paper_id, inner) + except Exception as e: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", paper_id, e) + + logger.info( + "Rows processed: %d, 
skipped: %d, mailings created: %d, papers created: %d, papers updated: %d", + stats["rows"], + stats["skipped"], + stats["mailings_created"], + stats["papers_created"], + stats["papers_updated"], + ) + logger.info("Done.") diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index 9e06f09..f771043 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -5,7 +5,6 @@ """ import logging -import os from django.core.management.base import BaseCommand from django.conf import settings @@ -13,8 +12,17 @@ logger = logging.getLogger(__name__) + def trigger_cloud_run_job(project_id: str, location: str, job_name: str): + """ + Start the named Cloud Run job (run once, no polling). + + Uses the Cloud Run v2 API to trigger the job identified by project_id, + location, and job_name. The job runs asynchronously; this function returns + the operation and does not wait for the job to finish. + """ from google.cloud import run_v2 + client = run_v2.JobsClient() name = client.job_path(project_id, location, job_name) request = run_v2.RunJobRequest(name=name) @@ -23,16 +31,39 @@ def trigger_cloud_run_job(project_id: str, location: str, job_name: str): logger.info("Cloud Run job triggered. Operation: %s", operation.operation.name) return operation + class Command(BaseCommand): + """Run WG21 paper tracker and optionally trigger the Cloud Run conversion job.""" + help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers." + def add_arguments(self, parser): + """Register --dry-run so the command can skip pipeline and Cloud Run.""" + parser.add_argument( + "--dry-run", + action="store_true", + help="Only log what would be done; do not run the pipeline or trigger Cloud Run.", + ) + def handle(self, *args, **options): + """ + Run the tracker pipeline; if new papers were uploaded, trigger the Cloud Run job. + + With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. + Otherwise runs the pipeline, then triggers the configured Cloud Run job when + total_new_papers > 0 and GCP_PROJECT_ID and WG21_CLOUD_RUN_JOB_NAME are set. + """ + dry_run = options.get("dry_run", False) + if dry_run: + logger.info("Dry run: skipping pipeline and Cloud Run trigger.") + return + logger.info("Starting WG21 Paper Tracker...") - + try: total_new_papers = run_tracker_pipeline() - self.stdout.write(self.style.SUCCESS(f"Downloaded and uploaded {total_new_papers} new papers.")) - + logger.info("Downloaded and uploaded %d new papers.", total_new_papers) + if total_new_papers > 0: project_id = settings.GCP_PROJECT_ID location = settings.GCP_LOCATION @@ -41,16 +72,18 @@ def handle(self, *args, **options): if project_id and job_name: try: trigger_cloud_run_job(project_id, location, job_name) - self.stdout.write(self.style.SUCCESS(f"Successfully triggered Cloud Run job {job_name}.")) + logger.info( + "Successfully triggered Cloud Run job %s.", job_name + ) except Exception as e: logger.error("Failed to trigger Cloud Run job: %s", e) - self.stderr.write(self.style.ERROR(f"Error triggering Cloud Run job: {e}")) else: - logger.warning("GCP_PROJECT_ID not configured. Skipping Cloud Run trigger.") - self.stdout.write(self.style.WARNING("Skipping Cloud Run trigger (missing GCP config).")) + logger.warning( + "GCP_PROJECT_ID not configured. Skipping Cloud Run trigger." 
+ ) else: - self.stdout.write("No new papers found. Skipping Cloud Run job.") - + logger.info("No new papers found. Skipping Cloud Run job.") + except Exception as e: logger.exception("WG21 Paper Tracker failed: %s", e) raise diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index 01e7e58..b4f9635 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.28 on 2026-03-09 15:35 +# Merged initial migration: WG21 Mailing, WG21 Paper (with year), WG21 Paper Author from django.db import migrations, models import django.db.models.deletion @@ -9,61 +9,127 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ('cppa_user_tracker', '0005_alter_slackuser_slack_user_id'), + ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), ] operations = [ migrations.CreateModel( - name='WG21Mailing', + name="WG21Mailing", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('mailing_date', models.CharField(db_index=True, max_length=7, unique=True)), - ('title', models.CharField(max_length=255)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "mailing_date", + models.CharField(db_index=True, max_length=7, unique=True), + ), + ("title", models.CharField(max_length=255)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), ], options={ - 'verbose_name': 'WG21 Mailing', - 'verbose_name_plural': 'WG21 Mailings', - 'db_table': 'wg21_paper_tracker_wg21mailing', - 'ordering': ['-mailing_date'], + "verbose_name": "WG21 Mailing", + "verbose_name_plural": "WG21 Mailings", + "db_table": "wg21_paper_tracker_wg21mailing", + "ordering": ["-mailing_date"], }, ), migrations.CreateModel( - name='WG21Paper', + name="WG21Paper", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('paper_id', models.CharField(db_index=True, max_length=255, unique=True)), - ('url', models.URLField(max_length=1024)), - ('title', models.CharField(db_index=True, max_length=1024)), - ('document_date', models.DateField(blank=True, db_index=True, null=True)), - ('subgroup', models.CharField(blank=True, db_index=True, max_length=255)), - ('is_downloaded', models.BooleanField(db_index=True, default=False)), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('updated_at', models.DateTimeField(auto_now=True)), - ('mailing', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='wg21_paper_tracker.wg21mailing')), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("paper_id", models.CharField(db_index=True, max_length=255)), + ("url", models.URLField(max_length=1024)), + ("title", models.CharField(db_index=True, max_length=1024)), + ( + "document_date", + models.DateField(blank=True, db_index=True, null=True), + ), + ( + "year", + models.IntegerField(blank=True, db_index=True, null=True), + ), + ( + "subgroup", + models.CharField( + blank=True, db_index=True, max_length=255 + ), + ), + ( + "is_downloaded", + models.BooleanField(db_index=True, default=False), + ), + ("created_at", 
models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "mailing", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="wg21_paper_tracker.wg21mailing", + ), + ), ], options={ - 'verbose_name': 'WG21 Paper', - 'verbose_name_plural': 'WG21 Papers', - 'db_table': 'wg21_paper_tracker_wg21paper', - 'ordering': ['-document_date', '-paper_id'], + "verbose_name": "WG21 Paper", + "verbose_name_plural": "WG21 Papers", + "db_table": "wg21_paper_tracker_wg21paper", + "ordering": ["-document_date", "-paper_id", "-year"], + "unique_together": {("paper_id", "year")}, }, ), migrations.CreateModel( - name='WG21PaperAuthor', + name="WG21PaperAuthor", fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('created_at', models.DateTimeField(auto_now_add=True)), - ('paper', models.ForeignKey(db_column='paper_id', on_delete=django.db.models.deletion.CASCADE, related_name='authors', to='wg21_paper_tracker.wg21paper')), - ('profile', models.ForeignKey(db_column='profile_id', on_delete=django.db.models.deletion.CASCADE, related_name='papers', to='cppa_user_tracker.wg21paperauthorprofile')), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "paper", + models.ForeignKey( + db_column="paper_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="authors", + to="wg21_paper_tracker.wg21paper", + ), + ), + ( + "profile", + models.ForeignKey( + db_column="profile_id", + on_delete=django.db.models.deletion.CASCADE, + related_name="papers", + to="cppa_user_tracker.wg21paperauthorprofile", + ), + ), ], options={ - 'verbose_name': 'WG21 Paper Author', - 'verbose_name_plural': 'WG21 Paper Authors', - 'db_table': 'wg21_paper_tracker_wg21paperauthor', - 'ordering': ['id'], - 'unique_together': {('paper', 'profile')}, + "verbose_name": "WG21 Paper Author", + "verbose_name_plural": "WG21 Paper Authors", + "db_table": "wg21_paper_tracker_wg21paperauthor", + "ordering": ["id"], + "unique_together": {("paper", "profile")}, }, ), ] diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py index 9ae2d27..44754b4 100644 --- a/wg21_paper_tracker/models.py +++ b/wg21_paper_tracker/models.py @@ -24,12 +24,13 @@ def __str__(self): class WG21Paper(models.Model): - """WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded).""" + """WG21 paper (paper_id, url, title, document_date, year, mailing, subgroup, is_downloaded).""" - paper_id = models.CharField(max_length=255, unique=True, db_index=True) + paper_id = models.CharField(max_length=255, db_index=True) url = models.URLField(max_length=1024) title = models.CharField(max_length=1024, db_index=True) document_date = models.DateField(db_index=True, null=True, blank=True) + year = models.IntegerField(null=True, blank=True, db_index=True) mailing = models.ForeignKey( WG21Mailing, on_delete=models.CASCADE, @@ -41,7 +42,8 @@ class WG21Paper(models.Model): updated_at = models.DateTimeField(auto_now=True) class Meta: - ordering = ["-document_date", "-paper_id"] + unique_together = (("paper_id", "year"),) + ordering = ["-document_date", "-paper_id", "-year"] verbose_name = "WG21 Paper" verbose_name_plural = "WG21 Papers" diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 3c4146c..edcf003 100644 --- 
a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -3,23 +3,35 @@ Coordinates scraping, downloading, uploading to GCS, and updating the database. """ -import os +import time import requests import logging from pathlib import Path -from typing import Optional from django.conf import settings from google.cloud import storage -from wg21_paper_tracker.fetcher import fetch_all_mailings, fetch_papers_for_mailing +from wg21_paper_tracker.fetcher import ( + fetch_all_mailings, + fetch_papers_for_mailing, +) from wg21_paper_tracker.models import WG21Mailing, WG21Paper -from wg21_paper_tracker.services import get_or_create_mailing, get_or_create_paper +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, +) from wg21_paper_tracker.workspace import get_raw_dir logger = logging.getLogger(__name__) -def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: str) -> bool: +DOWNLOAD_TIMEOUT = 30 +DOWNLOAD_MAX_RETRIES = 3 +DOWNLOAD_RETRY_DELAY = 2 + + +def _upload_to_gcs( + bucket_name: str, source_path: Path, destination_blob_name: str +) -> bool: """Uploads a file to the bucket.""" try: storage_client = storage.Client() @@ -27,32 +39,67 @@ def _upload_to_gcs(bucket_name: str, source_path: Path, destination_blob_name: s blob = bucket.blob(destination_blob_name) blob.upload_from_filename(str(source_path)) - logger.info("Uploaded %s to gs://%s/%s", source_path.name, bucket_name, destination_blob_name) + logger.info( + "Uploaded %s to gs://%s/%s", + source_path.name, + bucket_name, + destination_blob_name, + ) return True except Exception as e: logger.error("Failed to upload to GCS: %s", e) return False + def _download_file(url: str, filepath: Path) -> bool: - """Download file from URL to filepath.""" - try: - logger.info("Downloading %s to %s", url, filepath) - response = requests.get(url, timeout=60, stream=True) - response.raise_for_status() - - # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. - content_type = response.headers.get("content-type", "") - if "text" in content_type: - with open(filepath, "w", encoding="utf-8") as f: - f.write(response.content.decode(response.apparent_encoding or "utf-8", errors="replace")) - else: - with open(filepath, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - return True - except Exception as e: - logger.error("Failed to download %s: %s", url, e) - return False + """Download file from URL to filepath with retries and 30s timeout.""" + for attempt in range(1, DOWNLOAD_MAX_RETRIES + 1): + try: + logger.info( + "Downloading %s to %s (attempt %d/%d)", + url, + filepath, + attempt, + DOWNLOAD_MAX_RETRIES, + ) + response = requests.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) + response.raise_for_status() + + # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. + content_type = response.headers.get("content-type", "") + if "text" in content_type: + with open(filepath, "w", encoding="utf-8") as f: + f.write( + response.content.decode( + response.apparent_encoding or "utf-8", + errors="replace", + ) + ) + else: + with open(filepath, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + return True + except Exception as e: + if attempt < DOWNLOAD_MAX_RETRIES: + logger.warning( + "Download attempt %d/%d failed for %s: %s. 
Retrying in %ds.", + attempt, + DOWNLOAD_MAX_RETRIES, + url, + e, + DOWNLOAD_RETRY_DELAY, + ) + time.sleep(DOWNLOAD_RETRY_DELAY) + else: + logger.error( + "Failed to download %s after %d attempts: %s", + url, + DOWNLOAD_MAX_RETRIES, + e, + ) + return False + def run_tracker_pipeline() -> int: """ @@ -64,7 +111,11 @@ def run_tracker_pipeline() -> int: logger.warning("WG21_GCS_BUCKET not set. Will download but not upload to GCS.") # 1. Get latest mailing from DB - latest_mailing = WG21Mailing.objects.order_by("-mailing_date").first() + latest_mailing = ( + WG21Mailing.objects.exclude(mailing_date="unknown") + .order_by("-mailing_date") + .first() + ) latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01" # 2. Fetch all mailings @@ -76,22 +127,37 @@ def run_tracker_pipeline() -> int: # Filter newer mailings new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] # Also check the latest one again just in case new papers were added - if latest_mailing and latest_mailing.mailing_date not in [m["mailing_date"] for m in new_mailings]: + if latest_mailing and latest_mailing.mailing_date not in [ + m["mailing_date"] for m in new_mailings + ]: # We re-check the most recent mailing from the DB to catch late additions # Find the matching dict from all_mailings - current_m = next((m for m in all_mailings if m["mailing_date"] == latest_mailing.mailing_date), None) + current_m = next( + ( + m + for m in all_mailings + if m["mailing_date"] == latest_mailing.mailing_date + ), + None, + ) if current_m: new_mailings.append(current_m) # Sort chronologically (oldest to newest) new_mailings.sort(key=lambda x: x["mailing_date"]) + logger.info( + "Pipeline: latest_date=%s, all_mailings=%d, mailings_to_process=%s", + latest_date, + len(all_mailings), + [m["mailing_date"] for m in new_mailings], + ) total_new_papers = 0 for m_info in new_mailings: mailing_date = m_info["mailing_date"] title = m_info["title"] - year = m_info["year"] + year = int(m_info["year"]) if m_info["year"] else None # Create/get mailing in DB mailing_obj, _ = get_or_create_mailing(mailing_date, title) @@ -99,12 +165,16 @@ def run_tracker_pipeline() -> int: # Fetch papers for this mailing papers = fetch_papers_for_mailing(year, mailing_date) if not papers: + logger.info( + "Mailing %s: no papers found (anchor/table may be missing).", + mailing_date, + ) continue - # Group papers by ID to prioritize PDF over HTML + # Group papers by ID to prioritize PDF over HTML (paper_id is case-insensitive) papers_by_id = {} for p in papers: - pid = p["paper_id"] + pid = (p["paper_id"] or "").strip().lower() if pid not in papers_by_id: papers_by_id[pid] = [] papers_by_id[pid].append(p) @@ -115,10 +185,12 @@ def format_priority(ext: str) -> int: raw_dir = get_raw_dir(mailing_date) + skipped_downloaded = 0 for pid, p_list in papers_by_id.items(): # Check DB if this paper_id is already fully downloaded existing_paper = WG21Paper.objects.filter(paper_id=pid).first() if existing_paper and existing_paper.is_downloaded: + skipped_downloaded += 1 continue # Pick the best format @@ -138,19 +210,25 @@ def format_priority(ext: str) -> int: else: # If no GCS, simulate success so DB is updated uploaded = True - + # Persist DB doc_date_str = best_paper["document_date"] # Parse date if available from django.utils.dateparse import parse_date + doc_date = None if doc_date_str: try: doc_date = parse_date(doc_date_str) - except: - pass + except Exception as e: + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + 
e,
+                    )
+                    doc_date = None
 
-            paper_obj, created = get_or_create_paper(
+            paper_obj, _created = get_or_create_paper(
                 paper_id=pid,
                 url=url,
                 title=best_paper["title"],
@@ -158,17 +236,27 @@ def format_priority(ext: str) -> int:
                 mailing=mailing_obj,
                 subgroup=best_paper["subgroup"],
                 author_names=best_paper["authors"],
+                year=year,
             )
-            
+
             if uploaded:
                 paper_obj.is_downloaded = True
                 paper_obj.save(update_fields=["is_downloaded"])
                 total_new_papers += 1
 
             # Clean up local file to save space
             try:
                 local_path.unlink()
-            except Exception as e:
-                logger.warning("Could not delete temp file %s: %s", local_path, e)
+            except Exception as e:
+                logger.warning(
+                    "Could not delete temp file %s: %s", local_path, e
+                )
+
+        if skipped_downloaded:
+            logger.info(
+                "Mailing %s: skipped %d papers (already downloaded).",
+                mailing_date,
+                skipped_downloaded,
+            )
 
     return total_new_papers
diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py
index 3679d3f..cf846b0 100644
--- a/wg21_paper_tracker/services.py
+++ b/wg21_paper_tracker/services.py
@@ -13,8 +13,7 @@
 @transaction.atomic
 def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]:
     """Get or create a WG21Mailing by mailing_date; updates title if it changed."""
     mailing, created = WG21Mailing.objects.get_or_create(
-        mailing_date=mailing_date,
-        defaults={"title": title}
+        mailing_date=mailing_date, defaults={"title": title}
     )
     if not created and mailing.title != title:
         mailing.title = title
@@ -31,16 +30,24 @@ def get_or_create_paper(
     mailing: WG21Mailing,
     subgroup: str = "",
     author_names: Optional[list[str]] = None,
+    year: int | str | None = None,
 ) -> tuple[WG21Paper, bool]:
     """Get or create a WG21Paper by paper_id; updates fields and author links on re-runs."""
+    paper_id = (paper_id or "").strip().lower()
+    year_val = None
+    if year:
+        s = (year if isinstance(year, str) else str(year)).strip()[:4]
+        if s.isdigit():
+            year_val = int(s)
     paper, created = WG21Paper.objects.get_or_create(
         paper_id=paper_id,
+        year=year_val,
         defaults={
             "url": url,
             "title": title,
             "document_date": document_date,
             "mailing": mailing,
             "subgroup": subgroup,
-        }
+        },
     )
     if not created:
         updated = False
@@ -59,6 +66,9 @@ def get_or_create_paper(
         if paper.subgroup != subgroup:
             paper.subgroup = subgroup
             updated = True
+        if year_val is not None and paper.year != year_val:
+            paper.year = year_val
+            updated = True
         if updated:
             paper.save()
 
@@ -74,4 +84,5 @@ def get_or_create_paper(
 
 def mark_paper_downloaded(paper_id: str):
     """Set is_downloaded=True for the paper with the given paper_id."""
+    paper_id = (paper_id or "").strip().lower()
     WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True)
diff --git a/wg21_paper_tracker/tests/__init__.py b/wg21_paper_tracker/tests/__init__.py
new file mode 100644
index 0000000..18e481d
--- /dev/null
+++ b/wg21_paper_tracker/tests/__init__.py
@@ -0,0 +1 @@
+# Tests for wg21_paper_tracker app (excluding cloud_run_job).
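
The net effect of the service changes above: paper ids are normalized (stripped, lowercased) and the natural key widens from paper_id alone to (paper_id, year), so a document that is revised annually can keep one id across years. A small sketch of the intended behaviour, with illustrative URLs and titles:

    from wg21_paper_tracker.services import get_or_create_mailing, get_or_create_paper

    m24, _ = get_or_create_mailing("2024-11", "2024-11 mailing")
    m25, _ = get_or_create_mailing("2025-01", "2025-01 mailing")

    # Same logical document revised in two years -> two distinct rows.
    sd1_2024, _ = get_or_create_paper(
        paper_id="SD-1", url="https://example.com/sd-1-2024.pdf", title="SD-1 (2024)",
        document_date=None, mailing=m24, year=2024,
    )
    sd1_2025, _ = get_or_create_paper(
        paper_id=" sd-1 ", url="https://example.com/sd-1-2025.pdf", title="SD-1 (2025)",
        document_date=None, mailing=m25, year=2025,
    )
    assert sd1_2024.pk != sd1_2025.pk   # distinct (paper_id, year) rows
    assert sd1_2025.paper_id == "sd-1"  # "SD-1" and " sd-1 " normalize identically
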
diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py
new file mode 100644
index 0000000..a06317a
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_fetcher.py
@@ -0,0 +1,179 @@
+"""Tests for wg21_paper_tracker.fetcher."""
+
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from wg21_paper_tracker.fetcher import (
+    BASE_URL,
+    fetch_all_mailings,
+    fetch_papers_for_mailing,
+)
+
+
+# --- fetch_all_mailings ---
+
+
+def test_fetch_all_mailings_returns_empty_on_request_failure():
+    """fetch_all_mailings returns [] when requests.get raises."""
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        m.side_effect = Exception("network error")
+        result = fetch_all_mailings()
+        assert result == []
+
+
+def test_fetch_all_mailings_returns_empty_on_http_error():
+    """fetch_all_mailings returns [] when response.raise_for_status raises."""
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.raise_for_status.side_effect = Exception("404")
+        m.return_value = resp
+        result = fetch_all_mailings()
+        assert result == []
+
+
+def test_fetch_all_mailings_parses_links():
+    """fetch_all_mailings parses year/#mailingYYYY-MM links and returns mailings."""
+    html = """
+    <html><body>
+    <a href="2025/#mailing2025-01">2025-01 pre-meeting mailing</a>
+    <a href="2025/#mailing2025-02">2025-02 post-meeting mailing</a>
+    <a href="2024/#mailing2024-11">2024-11 mailing</a>
+    <a href="https://example.com/other">Ignore</a>
+    </body></html>
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_all_mailings()
+        assert len(result) == 3
+        assert result[0]["mailing_date"] == "2025-01"
+        assert result[0]["title"] == "2025-01 pre-meeting mailing"
+        assert result[0]["year"] == "2025"
+        assert result[1]["mailing_date"] == "2025-02"
+        assert result[2]["mailing_date"] == "2024-11"
+        assert result[2]["year"] == "2024"
+
+
+def test_fetch_all_mailings_calls_index_url():
+    """fetch_all_mailings calls BASE_URL/ with timeout."""
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        m.return_value = MagicMock(text="", raise_for_status=MagicMock())
+        fetch_all_mailings()
+        m.assert_called_once_with(f"{BASE_URL}/", timeout=30)
+
+
+# --- fetch_papers_for_mailing ---
+
+
+def test_fetch_papers_for_mailing_returns_empty_on_request_failure():
+    """fetch_papers_for_mailing returns [] when requests.get raises."""
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        m.side_effect = Exception("timeout")
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert result == []
+
+
+def test_fetch_papers_for_mailing_returns_empty_when_anchor_missing():
+    """fetch_papers_for_mailing returns [] when mailing anchor is not found."""
+    html = "<html><body><p>x</p></body></html>"
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert result == []
+
+
+def test_fetch_papers_for_mailing_finds_anchor_by_id():
+    """fetch_papers_for_mailing finds anchor by id=mailingYYYY-MM."""
+    html = """
+    <html><body>
+    <a id="mailing2025-01"></a>
+    <table>
+    <tr><td><a href="p1000r0.pdf">p1000r0.pdf</a></td><td>Title</td><td>Author</td><td>2025-01-15</td><td>SG1</td></tr>
+    </table>
+    </body></html>
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert len(result) == 1
+        assert result[0]["paper_id"] == "p1000r0"
+        assert result[0]["filename"] == "p1000r0.pdf"
+        assert result[0]["title"] == "Title"
+        assert result[0]["authors"] == ["Author"]
+        assert result[0]["document_date"] == "2025-01-15"
+        assert result[0]["subgroup"] == "SG1"
+
+
+def test_fetch_papers_for_mailing_finds_anchor_by_name():
+    """fetch_papers_for_mailing finds anchor by name= when id is missing."""
+    html = """
+    <html><body>
+    <a name="mailing2025-01"></a>
+    <table>
+    <tr><td><a href="n5034.html">n5034.html</a></td><td>Draft</td></tr>
+    </table>
+    </body></html>
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert len(result) == 1
+        assert result[0]["paper_id"] == "n5034"
+        assert result[0]["type"] == "html"
+
+
+def test_fetch_papers_for_mailing_normalizes_paper_id_lowercase():
+    """fetch_papers_for_mailing returns paper_id in lowercase."""
+    html = """
+    <html><body>
+    <a id="mailing2025-01"></a>
+    <table>
+    <tr><td><a href="P3039R1.PDF">P3039R1.PDF</a></td></tr>
+    </table>
+    </body></html>
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert result[0]["paper_id"] == "p3039r1"
+        assert result[0]["filename"] == "p3039r1.pdf"
+
+
+def test_fetch_papers_for_mailing_returns_empty_when_no_table():
+    """fetch_papers_for_mailing returns [] when no table follows anchor."""
+    html = """
+    <html><body>
+    <a id="mailing2025-01"></a>
+    <p>No table here</p>
+    </body></html>
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        result = fetch_papers_for_mailing("2025", "2025-01")
+        assert result == []
+
+
+def test_fetch_papers_for_mailing_calls_year_url():
+    """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout."""
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        m.return_value = MagicMock(text="", raise_for_status=MagicMock())
+        fetch_papers_for_mailing("2025", "2025-01")
+        m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30)
diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py
new file mode 100644
index 0000000..ca92819
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_models.py
@@ -0,0 +1,76 @@
+"""Tests for wg21_paper_tracker.models."""
+
+from datetime import date
+
+import pytest
+
+from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor
+
+
+@pytest.mark.django_db
+def test_wg21_mailing_str():
+    """WG21Mailing.__str__ returns mailing_date and title."""
+    m = WG21Mailing.objects.create(mailing_date="2025-01", title="2025-01 pre-meeting")
+    assert str(m) == "2025-01 (2025-01 pre-meeting)"
+
+
+@pytest.mark.django_db
+def test_wg21_paper_str():
+    """WG21Paper.__str__ returns paper_id and truncated title."""
+    m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title")
+    p = WG21Paper.objects.create(
+        paper_id="p1000r0",
+        url="https://example.com/p.pdf",
+        title="A short title",
+        document_date=date(2025, 1, 15),
+        mailing=m,
+        year=2025,
+    )
+    assert "p1000r0" in str(p)
+    assert "A short title" in str(p)
+
+
+@pytest.mark.django_db
+def test_wg21_paper_str_truncates_long_title():
+    """WG21Paper.__str__ truncates title to 60 chars."""
+    m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title")
+    long_title = "x" * 100
+    p = WG21Paper.objects.create(
+        paper_id="p1",
+        url="https://example.com/p.pdf",
+        title=long_title,
+        mailing=m,
+        year=2025,
+    )
+    assert len(str(p).split(": ", 1)[-1]) <= 60
+
+
+@pytest.mark.django_db
+def test_wg21_mailing_ordering():
+    """WG21Mailing default ordering is by mailing_date descending."""
+    WG21Mailing.objects.create(mailing_date="2025-01", title="A")
+    WG21Mailing.objects.create(mailing_date="2025-02", title="B")
+    dates = list(WG21Mailing.objects.values_list("mailing_date", flat=True))
+    assert dates == ["2025-02", "2025-01"]
+
+
+@pytest.mark.django_db
+def test_wg21_paper_unique_together_paper_id_year():
+    """WG21Paper allows same paper_id with different year."""
+    m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1")
+    m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2")
+    WG21Paper.objects.create(
+        paper_id="sd-1",
+        url="https://example.com/1.pdf",
+        title="T1",
+        mailing=m1,
+        year=2024,
+    )
+    p2 = WG21Paper.objects.create(
+        paper_id="sd-1",
+        url="https://example.com/2.pdf",
+        title="T2",
+        mailing=m2,
+        year=2025,
+    )
+    assert p2.pk is not None
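
The pipeline tests that follow exercise the download path end to end. One piece worth calling out is the format preference applied when a paper is listed in both PDF and HTML: pipeline.py defines a format_priority helper for this, but its body is not shown in this patch, so the version below is an assumed, illustrative implementation of the same idea rather than the project's actual code:

    # Sketch only: reduce the fetcher's per-paper entries to one preferred
    # format per paper_id (PDF first, then HTML, then anything else).
    def format_priority(ext: str) -> int:
        return {"pdf": 0, "html": 1}.get(ext, 2)

    papers = [
        {"paper_id": "p1000r0", "type": "html"},
        {"paper_id": "p1000r0", "type": "pdf"},
    ]
    best = {}
    for p in papers:
        pid = p["paper_id"].strip().lower()
        if pid not in best or format_priority(p["type"]) < format_priority(best[pid]["type"]):
            best[pid] = p
    assert best["p1000r0"]["type"] == "pdf"  # PDF wins over HTML
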
diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py
new file mode 100644
index 0000000..e052ce9
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_pipeline.py
@@ -0,0 +1,149 @@
+"""Tests for wg21_paper_tracker.pipeline."""
+
+import time
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from wg21_paper_tracker.pipeline import (
+    DOWNLOAD_TIMEOUT,
+    DOWNLOAD_MAX_RETRIES,
+    _download_file,
+    run_tracker_pipeline,
+)
+
+
+# --- _download_file ---
+
+
+def test_download_file_success_text(tmp_path):
+    """_download_file saves text response and returns True."""
+    url = "https://example.com/doc.html"
+    filepath = tmp_path / "doc.html"
+    resp = MagicMock()
+    resp.raise_for_status = MagicMock()
+    resp.headers = {"content-type": "text/html; charset=utf-8"}
+    resp.content = b"Hello"
+    resp.apparent_encoding = "utf-8"
+    resp.iter_content = None
+    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp):
+        result = _download_file(url, filepath)
+    assert result is True
+    assert filepath.read_text(encoding="utf-8") == "Hello"
+
+
+def test_download_file_success_binary(tmp_path):
+    """_download_file saves binary response and returns True."""
+    url = "https://example.com/doc.pdf"
+    filepath = tmp_path / "doc.pdf"
+    resp = MagicMock()
+    resp.raise_for_status = MagicMock()
+    resp.headers = {"content-type": "application/pdf"}
+    resp.iter_content = lambda chunk_size: (b"\x25\x50\x44\x46",)
+    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp):
+        result = _download_file(url, filepath)
+    assert result is True
+    assert filepath.read_bytes() == b"\x25\x50\x44\x46"
+
+
+def test_download_file_uses_timeout():
+    """_download_file calls requests.get with DOWNLOAD_TIMEOUT."""
+    url = "https://example.com/f"
+    filepath = Path("/tmp/out")
+    resp = MagicMock()
+    resp.raise_for_status = MagicMock()
+    resp.headers = {"content-type": "text/plain"}
+    resp.content = b"x"
+    resp.apparent_encoding = "utf-8"
+    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp) as m:
+        _download_file(url, filepath)
+    m.assert_called_once()
+    assert m.call_args[1]["timeout"] == DOWNLOAD_TIMEOUT
+
+
+def test_download_file_retries_on_failure(tmp_path):
+    """_download_file retries up to DOWNLOAD_MAX_RETRIES then returns False."""
+    url = "https://example.com/f"
+    filepath = tmp_path / "f"
+    with patch("wg21_paper_tracker.pipeline.requests.get") as m:
+        m.side_effect = Exception("connection error")
+        with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock:
+            result = _download_file(url, filepath)
+    assert result is False
+    assert m.call_count == DOWNLOAD_MAX_RETRIES
+    assert sleep_mock.call_count == DOWNLOAD_MAX_RETRIES - 1
+
+
+def test_download_file_succeeds_on_second_attempt(tmp_path):
+    """_download_file succeeds when a retry succeeds."""
+    url = "https://example.com/f"
+    filepath = tmp_path / "f"
+    resp = MagicMock()
+    resp.raise_for_status = MagicMock()
+    resp.headers = {"content-type": "text/plain"}
+    resp.content = b"ok"
+    resp.apparent_encoding = "utf-8"
+    with patch("wg21_paper_tracker.pipeline.requests.get") as m:
+        m.side_effect = [Exception("first fail"), resp]
+        with patch("wg21_paper_tracker.pipeline.time.sleep"):
+            result = _download_file(url, filepath)
+    assert result is True
+    assert m.call_count == 2
+    assert filepath.read_text() == "ok"
+
+
+# --- run_tracker_pipeline ---
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_returns_zero_when_no_mailings():
+    """run_tracker_pipeline returns 0 when fetch_all_mailings returns []."""
+    with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]):
+        n = run_tracker_pipeline()
+    assert n == 0
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_skips_when_no_new_mailings():
+    """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB."""
+    from wg21_paper_tracker.models import WG21Mailing
+    WG21Mailing.objects.create(mailing_date="2025-02", title="Latest")
+    with 
patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m: + m.return_value = [ + {"mailing_date": "2025-01", "title": "Old", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest", "year": "2025"}, + ] + with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[]): + n = run_tracker_pipeline() + assert n == 0 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_downloads_new_papers(tmp_path): + """run_tracker_pipeline downloads papers for new mailings and returns count.""" + from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-01", "title": "Previous", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": "https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers): + with patch("wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path): + with patch("wg21_paper_tracker.pipeline._download_file", return_value=True): + with patch("wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None): + n = run_tracker_pipeline() + assert n == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py new file mode 100644 index 0000000..4463f54 --- /dev/null +++ b/wg21_paper_tracker/tests/test_services.py @@ -0,0 +1,223 @@ +"""Tests for wg21_paper_tracker.services.""" + +from datetime import date +from unittest.mock import patch + +import pytest + +from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.services import ( + get_or_create_mailing, + get_or_create_paper, + mark_paper_downloaded, +) + + +# --- get_or_create_mailing --- + + +@pytest.mark.django_db +def test_get_or_create_mailing_creates_new(): + """get_or_create_mailing creates new mailing and returns (mailing, True).""" + m, created = get_or_create_mailing("2025-01", "2025-01 pre-meeting mailing") + assert created is True + assert m.mailing_date == "2025-01" + assert m.title == "2025-01 pre-meeting mailing" + + +@pytest.mark.django_db +def test_get_or_create_mailing_gets_existing(): + """get_or_create_mailing returns existing mailing and (mailing, False).""" + get_or_create_mailing("2025-01", "Original title") + m2, created2 = get_or_create_mailing("2025-01", "Updated title") + assert created2 is False + assert m2.mailing_date == "2025-01" + assert m2.title == "Updated title" # title is updated when different + + +@pytest.mark.django_db +def test_get_or_create_mailing_updates_title_when_different(): + """get_or_create_mailing updates title when existing has different title.""" + get_or_create_mailing("2025-02", "Old title") + m, _ = get_or_create_mailing("2025-02", "New title") + m.refresh_from_db() + assert m.title == "New title" + + +# --- get_or_create_paper --- + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +def test_get_or_create_paper_creates_new(mock_profile, db): + """get_or_create_paper creates new paper and returns (paper, True).""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, created = get_or_create_paper( + paper_id="p1000r0", + 
url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=date(2025, 1, 15), + mailing=mailing, + subgroup="SG1", + author_names=None, + year=2025, + ) + assert created is True + assert paper.paper_id == "p1000r0" + assert paper.title == "A paper" + assert paper.year == 2025 + assert paper.mailing_id == mailing.id + assert paper.subgroup == "SG1" + mock_profile.assert_not_called() + + +@pytest.mark.django_db +@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") +def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): + """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" + from unittest.mock import MagicMock + profile = MagicMock() + profile.pk = 1 + mock_profile.return_value = (profile, True) + + mailing, _ = get_or_create_mailing("2025-01", "Title") + with patch("wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create") as mock_link: + mock_link.return_value = (MagicMock(), True) + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=None, + mailing=mailing, + author_names=["Alice", "Bob"], + year=2025, + ) + assert created is True + assert mock_profile.call_count == 2 + mock_profile.assert_any_call("Alice") + mock_profile.assert_any_call("Bob") + + +@pytest.mark.django_db +def test_get_or_create_paper_normalizes_paper_id_lowercase(db): + """get_or_create_paper stores paper_id in lowercase.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id=" P3039R1 ", + url="https://example.com/p3039r1.pdf", + title="T", + document_date=None, + mailing=mailing, + year=2025, + ) + assert paper.paper_id == "p3039r1" + + +@pytest.mark.django_db +def test_get_or_create_paper_gets_existing_and_updates(db): + """get_or_create_paper returns existing and updates fields when different.""" + mailing1, _ = get_or_create_mailing("2025-01", "M1") + mailing2, _ = get_or_create_mailing("2025-02", "M2") + get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/old.pdf", + title="Old title", + document_date=date(2025, 1, 1), + mailing=mailing1, + subgroup="SG1", + year=2025, + ) + paper2, created2 = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/new.pdf", + title="New title", + document_date=date(2025, 2, 1), + mailing=mailing2, + subgroup="SG2", + year=2025, + ) + assert created2 is False + paper2.refresh_from_db() + assert paper2.url == "https://example.com/new.pdf" + assert paper2.title == "New title" + assert paper2.mailing_id == mailing2.id + assert paper2.subgroup == "SG2" + + +@pytest.mark.django_db +def test_get_or_create_paper_year_none_stored_as_null(db): + """get_or_create_paper with year=None stores null.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="n5034", + url="https://example.com/n5034.html", + title="Draft", + document_date=None, + mailing=mailing, + year=None, + ) + assert paper.year is None + + +@pytest.mark.django_db +def test_get_or_create_paper_same_paper_id_different_year_creates_two(db): + """get_or_create_paper creates separate rows for same paper_id different year (unique_together).""" + mailing1, _ = get_or_create_mailing("2024-11", "M1") + mailing2, _ = get_or_create_mailing("2025-01", "M2") + p1, c1 = get_or_create_paper( + paper_id="sd-1", + url="https://example.com/sd-1-2024.pdf", + title="SD 2024", + document_date=None, + 
mailing=mailing1,
+        year=2024,
+    )
+    p2, c2 = get_or_create_paper(
+        paper_id="sd-1",
+        url="https://example.com/sd-1-2025.pdf",
+        title="SD 2025",
+        document_date=None,
+        mailing=mailing2,
+        year=2025,
+    )
+    assert c1 is True and c2 is True
+    assert p1.pk != p2.pk
+    assert p1.year == 2024 and p2.year == 2025
+
+
+# --- mark_paper_downloaded ---
+
+
+@pytest.mark.django_db
+def test_mark_paper_downloaded_sets_flag(db):
+    """mark_paper_downloaded sets is_downloaded=True for matching paper_id."""
+    mailing, _ = get_or_create_mailing("2025-01", "Title")
+    paper, _ = get_or_create_paper(
+        paper_id="p1000r0",
+        url="https://example.com/p.pdf",
+        title="T",
+        document_date=None,
+        mailing=mailing,
+        year=2025,
+    )
+    assert paper.is_downloaded is False
+    mark_paper_downloaded("p1000r0")
+    paper.refresh_from_db()
+    assert paper.is_downloaded is True
+
+
+@pytest.mark.django_db
+def test_mark_paper_downloaded_normalizes_paper_id(db):
+    """mark_paper_downloaded matches case-insensitively (normalizes to lower)."""
+    mailing, _ = get_or_create_mailing("2025-01", "Title")
+    paper, _ = get_or_create_paper(
+        paper_id="p1000r0",
+        url="https://example.com/p.pdf",
+        title="T",
+        document_date=None,
+        mailing=mailing,
+        year=2025,
+    )
+    mark_paper_downloaded(" P1000R0 ")
+    paper.refresh_from_db()
+    assert paper.is_downloaded is True
diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py
new file mode 100644
index 0000000..25a828e
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_workspace.py
@@ -0,0 +1,73 @@
+"""Tests for wg21_paper_tracker.workspace."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from wg21_paper_tracker.workspace import get_workspace_root, get_raw_dir
+
+
+@pytest.fixture
+def mock_workspace_path(tmp_path):
+    """Patch get_workspace_path to return tmp_path for app slugs."""
+
+    def _get_path(app_slug):
+        p = tmp_path / app_slug.replace("/", "_")
+        p.mkdir(parents=True, exist_ok=True)
+        return p
+
+    with patch("wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path):
+        yield tmp_path
+
+
+def test_get_workspace_root_returns_path(mock_workspace_path):
+    """get_workspace_root returns Path for app workspace."""
+    root = get_workspace_root()
+    assert "wg21_paper_tracker" in str(root)
+    assert root.is_dir()
+
+
+def test_get_workspace_root_calls_get_workspace_path_with_slug():
+    """get_workspace_root calls get_workspace_path with app slug."""
+    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
+        m.return_value = Path("/fake/workspace/wg21_paper_tracker")
+        root = get_workspace_root()
+        m.assert_called_once_with("wg21_paper_tracker")
+        assert root == Path("/fake/workspace/wg21_paper_tracker")
+
+
+def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path):
+    """get_raw_dir returns raw/wg21_paper_tracker/<mailing_date>/."""
+    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
+        raw_root = mock_workspace_path / "raw_wg21_paper_tracker"
+        raw_root.mkdir(parents=True, exist_ok=True)
+        m.side_effect = lambda slug: {
+            "wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker",
+            "raw/wg21_paper_tracker": raw_root,
+        }[slug]
+        path = get_raw_dir("2025-01")
+        assert path == raw_root / "2025-01"
+        assert path.is_dir()
+
+
+def test_get_raw_dir_creates_parents(mock_workspace_path):
+    """get_raw_dir creates parent directories."""
+    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
+        raw_root = mock_workspace_path / "raw_app"
+        
raw_root.mkdir(parents=True, exist_ok=True) + m.side_effect = lambda slug: raw_root if "raw" in slug else (mock_workspace_path / "app") + path = get_raw_dir("2026-02") + assert path.exists() + assert path.name == "2026-02" + + +def test_get_raw_dir_idempotent(mock_workspace_path): + """get_raw_dir can be called twice for same mailing_date without error.""" + with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: + raw_root = mock_workspace_path / "raw" + raw_root.mkdir(parents=True, exist_ok=True) + m.side_effect = lambda slug: raw_root + p1 = get_raw_dir("2025-01") + p2 = get_raw_dir("2025-01") + assert p1 == p2 From 18f07c3536f7cf841c91d1fffb6d7b7f81827d61 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 9 Mar 2026 19:22:05 -0700 Subject: [PATCH 03/20] Fix lint/format error #24 --- .../commands/import_wg21_metadata_from_csv.py | 25 +++++++++++-------- wg21_paper_tracker/tests/test_fetcher.py | 6 +++-- wg21_paper_tracker/tests/test_models.py | 2 +- wg21_paper_tracker/tests/test_pipeline.py | 23 ++++++++++++----- wg21_paper_tracker/tests/test_services.py | 6 +++-- wg21_paper_tracker/tests/test_workspace.py | 8 ++++-- 6 files changed, 46 insertions(+), 24 deletions(-) diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 5d4a398..966ce64 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -41,11 +41,7 @@ def _normalize_title(raw: str) -> str: if not raw: return "" one_line = " ".join(raw.split()) - return ( - one_line[:TITLE_MAX_LENGTH] - if len(one_line) > TITLE_MAX_LENGTH - else one_line - ) + return one_line[:TITLE_MAX_LENGTH] if len(one_line) > TITLE_MAX_LENGTH else one_line def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]: @@ -117,9 +113,7 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): - csv_path = options.get("csv_file") or ( - get_workspace_root() / "metadata.csv" - ) + csv_path = options.get("csv_file") or (get_workspace_root() / "metadata.csv") dry_run = options["dry_run"] if not csv_path.exists(): @@ -224,16 +218,25 @@ def handle(self, *args, **options): paper.year = year paper.save() if author_names: - from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile + from cppa_user_tracker.services import ( + get_or_create_wg21_paper_author_profile, + ) + for name in author_names: - profile, _ = get_or_create_wg21_paper_author_profile(name) + profile, _ = get_or_create_wg21_paper_author_profile( + name + ) WG21PaperAuthor.objects.get_or_create( paper=paper, profile=profile, ) except Exception as inner: stats["skipped"] += 1 - logger.error("Error for paper_id=%s (after IntegrityError): %s", paper_id, inner) + logger.error( + "Error for paper_id=%s (after IntegrityError): %s", + paper_id, + inner, + ) except Exception as e: stats["skipped"] += 1 logger.error("Error for paper_id=%s: %s", paper_id, e) diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index a06317a..70a2338 100644 --- a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -2,7 +2,6 @@ from unittest.mock import patch, MagicMock -import pytest from wg21_paper_tracker.fetcher import ( BASE_URL, @@ -174,6 +173,9 @@ def test_fetch_papers_for_mailing_returns_empty_when_no_table(): def test_fetch_papers_for_mailing_calls_year_url(): 
"""fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.return_value = MagicMock(text="", raise_for_status=MagicMock()) + m.return_value = MagicMock( + text="", + raise_for_status=MagicMock(), + ) fetch_papers_for_mailing("2025", "2025-01") m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30) diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py index ca92819..5d9a1ac 100644 --- a/wg21_paper_tracker/tests/test_models.py +++ b/wg21_paper_tracker/tests/test_models.py @@ -4,7 +4,7 @@ import pytest -from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor +from wg21_paper_tracker.models import WG21Mailing, WG21Paper @pytest.mark.django_db diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index e052ce9..ad4df9c 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -1,6 +1,5 @@ """Tests for wg21_paper_tracker.pipeline.""" -import time from pathlib import Path from unittest.mock import patch, MagicMock @@ -108,13 +107,16 @@ def test_run_tracker_pipeline_returns_zero_when_no_mailings(): def test_run_tracker_pipeline_skips_when_no_new_mailings(): """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB.""" from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") with patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m: m.return_value = [ {"mailing_date": "2025-01", "title": "Old", "year": "2025"}, {"mailing_date": "2025-02", "title": "Latest", "year": "2025"}, ] - with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[]): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[] + ): n = run_tracker_pipeline() assert n == 0 @@ -123,6 +125,7 @@ def test_run_tracker_pipeline_skips_when_no_new_mailings(): def test_run_tracker_pipeline_downloads_new_papers(tmp_path): """run_tracker_pipeline downloads papers for new mailings and returns count.""" from wg21_paper_tracker.models import WG21Mailing + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") mailings = [ {"mailing_date": "2025-01", "title": "Previous", "year": "2025"}, @@ -141,9 +144,17 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path): }, ] with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): - with patch("wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers): - with patch("wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path): - with patch("wg21_paper_tracker.pipeline._download_file", return_value=True): - with patch("wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + with patch( + "wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path + ): + with patch( + "wg21_paper_tracker.pipeline._download_file", return_value=True + ): + with patch( + "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None + ): n = run_tracker_pipeline() assert n == 1 diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index 4463f54..a0a9b6f 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -5,7 +5,6 @@ import pytest -from wg21_paper_tracker.models import 
WG21Mailing, WG21Paper from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, @@ -76,12 +75,15 @@ def test_get_or_create_paper_creates_new(mock_profile, db): def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" from unittest.mock import MagicMock + profile = MagicMock() profile.pk = 1 mock_profile.return_value = (profile, True) mailing, _ = get_or_create_mailing("2025-01", "Title") - with patch("wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create") as mock_link: + with patch( + "wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create" + ) as mock_link: mock_link.return_value = (MagicMock(), True) paper, created = get_or_create_paper( paper_id="p1000r0", diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py index 25a828e..3689ae9 100644 --- a/wg21_paper_tracker/tests/test_workspace.py +++ b/wg21_paper_tracker/tests/test_workspace.py @@ -17,7 +17,9 @@ def _get_path(app_slug): p.mkdir(parents=True, exist_ok=True) return p - with patch("wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path): + with patch( + "wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path + ): yield tmp_path @@ -56,7 +58,9 @@ def test_get_raw_dir_creates_parents(mock_workspace_path): with patch("wg21_paper_tracker.workspace.get_workspace_path") as m: raw_root = mock_workspace_path / "raw_app" raw_root.mkdir(parents=True, exist_ok=True) - m.side_effect = lambda slug: raw_root if "raw" in slug else (mock_workspace_path / "app") + m.side_effect = lambda slug: ( + raw_root if "raw" in slug else (mock_workspace_path / "app") + ) path = get_raw_dir("2026-02") assert path.exists() assert path.name == "2026-02" From f4388ffabb8fd007768d877125518a9afbab4788 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 13:51:22 -0700 Subject: [PATCH 04/20] Validate mailing_date in get_raw_dir; WG21 author order/resolution and docs #24 --- ...0006_wg21paperauthorprofile_author_alas.py | 19 +++++ cppa_user_tracker/models.py | 1 + cppa_user_tracker/services.py | 34 +++++++- cppa_user_tracker/tests/test_services.py | 77 +++++++++++++++++++ docs/Schema.md | 15 ++-- docs/operations/WG21_Cloud_Run.md | 4 +- docs/service_api/cppa_user_tracker.md | 8 ++ wg21_paper_tracker/cloud_run_job/Dockerfile | 6 +- .../converters/openai_converter.py | 23 +++--- .../converters/pdfplumber_converter.py | 13 +++- wg21_paper_tracker/cloud_run_job/main.py | 4 +- .../cloud_run_job/requirements.txt | 2 +- wg21_paper_tracker/fetcher.py | 8 +- .../commands/import_wg21_metadata_from_csv.py | 10 +-- .../commands/run_wg21_paper_tracker.py | 6 +- wg21_paper_tracker/migrations/0001_initial.py | 8 +- wg21_paper_tracker/models.py | 5 +- wg21_paper_tracker/pipeline.py | 20 ++++- wg21_paper_tracker/services.py | 54 ++++++++++--- wg21_paper_tracker/tests/test_fetcher.py | 13 ++-- wg21_paper_tracker/tests/test_pipeline.py | 5 +- wg21_paper_tracker/tests/test_services.py | 71 +++++++++++------ wg21_paper_tracker/tests/test_workspace.py | 20 ++++- wg21_paper_tracker/workspace.py | 4 + 24 files changed, 338 insertions(+), 92 deletions(-) create mode 100644 cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py new file mode 100644 index 
0000000..9c47bb5
--- /dev/null
+++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.2.28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="wg21paperauthorprofile",
+            name="author_alas",
+            field=models.CharField(blank=True, db_index=True, default="", max_length=255),
+            preserve_default=True,
+        ),
+    ]
diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py
index 46be627..75a52c6 100644
--- a/cppa_user_tracker/models.py
+++ b/cppa_user_tracker/models.py
@@ -165,6 +165,7 @@ def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
 
     display_name = models.CharField(max_length=255, db_index=True, blank=True)
+    author_alas = models.CharField(max_length=255, blank=True, db_index=True)
     created_at = models.DateTimeField(auto_now_add=True)
     updated_at = models.DateTimeField(auto_now=True)
diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py
index 35b4e31..146f778 100644
--- a/cppa_user_tracker/services.py
+++ b/cppa_user_tracker/services.py
@@ -357,9 +357,41 @@ def get_or_create_discord_profile(
 
 def get_or_create_wg21_paper_author_profile(
     display_name: str,
-) -> tuple[Any, bool]:
-    """Get or create a WG21PaperAuthorProfile by display_name."""
+    email: Optional[str] = None,
+) -> tuple[WG21PaperAuthorProfile, bool]:
+    """Get or create a WG21PaperAuthorProfile by display_name, with optional email disambiguation.
+
+    Finds all profiles with the given display_name. If none exist, creates one and adds
+    email if provided. If one exists, returns it. If multiple exist and an email is
+    provided, returns the one with that email if any; otherwise returns the first.
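+
+    Example (illustrative; the name and email are placeholders):
+
+        profile, created = get_or_create_wg21_paper_author_profile(
+            "Jane Doe", email="jane@example.com"
+        )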
+ """ display_name_val = (display_name or "").strip() - return WG21PaperAuthorProfile.objects.get_or_create( - display_name=display_name_val, + email_val = (email or "").strip() or None + + candidates = list( + WG21PaperAuthorProfile.objects.filter(display_name=display_name_val).order_by( + "id" + ) ) + + if not candidates: + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) + if email_val: + add_email(profile, email_val, is_primary=True) + return profile, True + + if len(candidates) == 1: + return candidates[0], False + + # Two or more: disambiguate by email if provided + if email_val: + for p in candidates: + if p.emails.filter(email=email_val).exists(): + return p, False + return candidates[0], False diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index cf61481..6d4d85b 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -8,6 +8,7 @@ GitHubAccountType, Identity, TempProfileIdentityRelation, + WG21PaperAuthorProfile, ) from cppa_user_tracker import services @@ -569,3 +570,79 @@ def test_get_or_create_mailing_list_profile_strips_display_name_and_email(): assert created is True assert profile.display_name == "Trimmed" assert profile.emails.filter(email="trimmed@example.com").exists() + + +# --- get_or_create_wg21_paper_author_profile --- + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_creates(): + """get_or_create_wg21_paper_author_profile creates new profile when none exist.""" + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="New Author" + ) + assert created is True + assert profile.display_name == "New Author" + assert WG21PaperAuthorProfile.objects.filter(display_name="New Author").count() == 1 + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_no_candidates_with_email_adds_email(): + """get_or_create_wg21_paper_author_profile adds email to new profile when provided.""" + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Author With Email", + email="author@example.com", + ) + assert created is True + assert profile.emails.filter(email="author@example.com").exists() + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): + """get_or_create_wg21_paper_author_profile returns existing profile when exactly one matches.""" + existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Solo Author" + ) + assert created is False + assert profile.id == existing.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first(): + """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + _second = WG21PaperAuthorProfile.objects.create(display_name="Dup Name") + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Dup Name" + ) + assert created is False + assert profile.id == first.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_second(): + """get_or_create_wg21_paper_author_profile returns profile with matching email when multiple match.""" + _first = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + 
second = WG21PaperAuthorProfile.objects.create(display_name="Same Name") + services.add_email(second, "match@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Same Name", + email="match@example.com", + ) + assert created is False + assert profile.id == second.id + + +@pytest.mark.django_db +def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_returns_first(): + """get_or_create_wg21_paper_author_profile returns first when email provided but no match.""" + first = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + second = WG21PaperAuthorProfile.objects.create(display_name="Other Name") + services.add_email(second, "other@example.com", is_primary=True) + profile, created = services.get_or_create_wg21_paper_author_profile( + display_name="Other Name", + email="nomatch@example.com", + ) + assert created is False + assert profile.id == first.id diff --git a/docs/Schema.md b/docs/Schema.md index 308a662..6b22def 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -68,6 +68,7 @@ erDiagram WG21PaperAuthorProfile { string display_name "IX" + string author_alas "IX" datetime created_at datetime updated_at } @@ -618,6 +619,7 @@ erDiagram int id PK int paper_id FK int profile_id FK + int author_order datetime created_at } @@ -631,7 +633,8 @@ erDiagram WG21Paper { int id PK - string paper_id UK "IX" + string paper_id "IX" + int year "IX" string url string title "IX" date document_date "IX" @@ -647,7 +650,9 @@ erDiagram **Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing. -**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. +**Note:** **WG21Paper** is uniquely identified by the composite `(paper_id, year)`; `paper_id` is not globally unique. The same paper identifier may appear in different years (e.g. revisions). + +**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. `author_order` is optional and 1-based; it indicates the order of authors on the paper. --- @@ -720,7 +725,7 @@ erDiagram | **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. | 1 | | **SlackUser** | Profile for Slack; extends BaseProfile. | 1 | | **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | -| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. | 1 | +| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alas`. | 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | | **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | @@ -761,8 +766,8 @@ erDiagram | **SlackChannelMembership** | Channel-member link (slack_user_id, is_restricted, is_deleted). | 6 | | **SlackChannelMembershipChangeLog** | Log of join/leave (slack_user_id, is_joined, created_at). | 6 | | **WG21Mailing** | WG21 mailing release (mailing_date, title). | 7 | -| **WG21Paper** | WG21 paper (paper_id, url, title, document_date, mailing, subgroup, is_downloaded). | 7 | -| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile). 
| 7 | +| **WG21Paper** | WG21 paper (paper_id, year, url, title, document_date, mailing, subgroup, is_downloaded). Unique on (paper_id, year). | 7 | +| **WG21PaperAuthor** | Paper-author link (paper_id, profile_id→WG21PaperAuthorProfile). Optional `author_order` (1-based) for ordering. | 7 | | **Website** | Daily site visit total (stat_date, website_visit_count). | 8 | | **WebsiteVisitCount** | Per-date, per-country visit count. | 8 | | **WebsiteWordCount** | Per-date, per-word count. | 8 | diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index 257e2bc..7840bd1 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -44,9 +44,11 @@ gcloud run jobs create wg21-convert \ --memory 8Gi \ --cpu 4 \ --region us-central1 \ - --set-env-vars WG21_GCS_BUCKET=wg21-data-collector,OPENROUTER_API_KEY=your_key + --set-env-vars WG21_GCS_BUCKET=wg21-data-collector ``` +Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manager](https://cloud.google.com/run/docs/configuring/secrets)) rather than inline in `--set-env-vars`, to avoid leaking the key into shell history, CI logs, or audit trails. + ## 4. Service Account & IAM Permissions 1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API. diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index f638501..4ca0adb 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -41,6 +41,14 @@ --- +## WG21PaperAuthorProfile + +| Function | Parameter types | Return type | Description | +| -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name. If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist, and email is provided, returns the one with that email if any; otherwise returns the first. Use this when linking paper authors so that same name + same email link to the same profile. | + +--- + ## DiscordProfile | Function | Parameter types | Return type | Description | diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile index 21b51ef..d52244b 100644 --- a/wg21_paper_tracker/cloud_run_job/Dockerfile +++ b/wg21_paper_tracker/cloud_run_job/Dockerfile @@ -11,6 +11,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libglib2.0-0 \ && rm -rf /var/lib/apt/lists/* +RUN groupadd -r app && useradd -r -g app app + # Copy requirements COPY requirements.txt . @@ -18,7 +20,9 @@ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Copy application files -COPY . . +COPY --chown=app:app . . 
+ +USER app # Run the main script CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index cd168aa..3a94230 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -2,12 +2,16 @@ OpenAI/OpenRouter-based PDF to Markdown converter with OCR. """ -import os +from __future__ import annotations + import base64 +import io +import logging +import os from pathlib import Path from typing import Optional + import requests -import logging logger = logging.getLogger(__name__) @@ -21,7 +25,6 @@ try: from pdf2image import convert_from_path from PIL import Image, ImageOps - import io PDF2IMAGE_AVAILABLE = True except ImportError: @@ -211,6 +214,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: total_pages = len(images) markdown_parts = [] + successful_pages = 0 # Process each page for page_num, image in enumerate(images, 1): @@ -226,6 +230,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: if page_markdown: markdown_parts.append(page_markdown) markdown_parts.append("\n\n") + successful_pages += 1 else: logger.warning(f"Failed to convert page {page_num} with OpenAI") markdown_parts.append( @@ -243,17 +248,17 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: markdown_content = "".join(markdown_parts) - if markdown_content and len(markdown_content.strip()) > 0: + if successful_pages > 0 and markdown_content.strip(): logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") logger.info( f"Extracted {len(markdown_content)} characters from {total_pages} pages" ) return markdown_content - else: - logger.warning( - f"OpenAI/OpenRouter conversion returned empty content for: {pdf_path.name}" - ) - return None + logger.warning( + "OpenAI/OpenRouter conversion produced no usable pages for: %s", + pdf_path.name, + ) + return None except Exception as e: logger.error( diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 58a1465..6329c5a 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -46,7 +46,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: text = page.extract_text() if text: - markdown_parts.append(text) + markdown_parts.append(text.replace("\n", " \n")) markdown_parts.append("\n\n") # Extract tables if any @@ -55,6 +55,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: for table in tables: if table: markdown_parts.append("\n### Table\n\n") + first_row = True # Convert table to markdown format for row in table: if row: @@ -66,6 +67,13 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: ) + " |\n" ) + if first_row: + markdown_parts.append( + "| " + + " | ".join("---" for _ in row) + + " |\n" + ) + first_row = False markdown_parts.append("\n") except Exception as e: @@ -88,6 +96,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: except Exception as e: logger.error( - f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", exc_info=True + f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", + exc_info=True, ) return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index cf704ae..cdfe40a 100644 --- 
a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -2,6 +2,8 @@ import logging from pathlib import Path import tempfile +from typing import Optional + from google.cloud import storage from converters.docling_converter import convert_with_docling @@ -16,7 +18,7 @@ MIN_CONTENT_LENGTH = 50 -def is_content_valid(content: str) -> bool: +def is_content_valid(content: Optional[str]) -> bool: if not content: return False content_stripped = content.strip() diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt index 0a00731..096efc5 100644 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ b/wg21_paper_tracker/cloud_run_job/requirements.txt @@ -1,6 +1,6 @@ docling>=1.0.0 pdfplumber>=0.10.0 pdf2image>=1.16.0 -Pillow>=10.0.0 +Pillow>=10.3.0 requests>=2.31.0 google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index e733e83..4e44bd0 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -29,8 +29,8 @@ def fetch_all_mailings() -> list[dict]: try: response = requests.get(f"{BASE_URL}/", timeout=30) response.raise_for_status() - except Exception as e: - logger.error("Failed to fetch WG21 index: %s", e) + except requests.RequestException: + logger.error("Failed to fetch WG21 index.") return [] # The mailings are listed in a markdown-like syntax or links @@ -65,8 +65,8 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: try: response = requests.get(url, timeout=30) response.raise_for_status() - except Exception as e: - logger.error("Failed to fetch year page %s: %s", year, e) + except requests.RequestException: + logger.error("Failed to fetch year page %s.", year) return [] soup = BeautifulSoup(response.text, "html.parser") diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 966ce64..0d1b903 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -16,10 +16,11 @@ from django.db import IntegrityError from django.utils.dateparse import parse_date -from wg21_paper_tracker.models import WG21Paper, WG21PaperAuthor +from wg21_paper_tracker.models import WG21Paper from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, + get_or_create_paper_author, ) from wg21_paper_tracker.workspace import get_workspace_root @@ -222,14 +223,11 @@ def handle(self, *args, **options): get_or_create_wg21_paper_author_profile, ) - for name in author_names: + for i, name in enumerate(author_names): profile, _ = get_or_create_wg21_paper_author_profile( name ) - WG21PaperAuthor.objects.get_or_create( - paper=paper, - profile=profile, - ) + get_or_create_paper_author(paper, profile, i + 1) except Exception as inner: stats["skipped"] += 1 logger.error( diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index f771043..3945269 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -65,9 +65,9 @@ def handle(self, *args, **options): logger.info("Downloaded and uploaded %d new papers.", total_new_papers) if total_new_papers > 0: - project_id = settings.GCP_PROJECT_ID - location = 
settings.GCP_LOCATION - job_name = settings.WG21_CLOUD_RUN_JOB_NAME + project_id = getattr(settings, "GCP_PROJECT_ID", None) + location = getattr(settings, "GCP_LOCATION", "us-central1") + job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) if project_id and job_name: try: diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index b4f9635..a2bbf3d 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Merged initial migration: WG21 Mailing, WG21 Paper (with year), WG21 Paper Author +# Merged initial migration: WG21 Mailing, WG21 Paper (year not null), WG21 Paper Author from django.db import migrations, models import django.db.models.deletion @@ -59,10 +59,7 @@ class Migration(migrations.Migration): "document_date", models.DateField(blank=True, db_index=True, null=True), ), - ( - "year", - models.IntegerField(blank=True, db_index=True, null=True), - ), + ("year", models.IntegerField(db_index=True, default=0)), ( "subgroup", models.CharField( @@ -104,6 +101,7 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), + ("author_order", models.PositiveIntegerField(blank=True, null=True)), ("created_at", models.DateTimeField(auto_now_add=True)), ( "paper", diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py index 44754b4..fede57b 100644 --- a/wg21_paper_tracker/models.py +++ b/wg21_paper_tracker/models.py @@ -30,7 +30,7 @@ class WG21Paper(models.Model): url = models.URLField(max_length=1024) title = models.CharField(max_length=1024, db_index=True) document_date = models.DateField(db_index=True, null=True, blank=True) - year = models.IntegerField(null=True, blank=True, db_index=True) + year = models.IntegerField(default=0, db_index=True) mailing = models.ForeignKey( WG21Mailing, on_delete=models.CASCADE, @@ -42,7 +42,7 @@ class WG21Paper(models.Model): updated_at = models.DateTimeField(auto_now=True) class Meta: - unique_together = (("paper_id", "year"),) + unique_together = [["paper_id", "year"]] ordering = ["-document_date", "-paper_id", "-year"] verbose_name = "WG21 Paper" verbose_name_plural = "WG21 Papers" @@ -66,6 +66,7 @@ class WG21PaperAuthor(models.Model): related_name="papers", db_column="profile_id", ) + author_order = models.PositiveIntegerField(null=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) class Meta: diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index edcf003..16a9bb6 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -186,10 +186,14 @@ def format_priority(ext: str) -> int: raw_dir = get_raw_dir(mailing_date) skipped_downloaded = 0 + year_val = year if year is not None else 0 for pid, p_list in papers_by_id.items(): - # Check DB if this paper_id is already fully downloaded - existing_paper = WG21Paper.objects.filter(paper_id=pid).first() - if existing_paper and existing_paper.is_downloaded: + # Skip only if this (paper_id, year) is already downloaded + if WG21Paper.objects.filter( + paper_id=pid, + year=year_val, + is_downloaded=True, + ).exists(): skipped_downloaded += 1 continue @@ -197,7 +201,15 @@ def format_priority(ext: str) -> int: p_list.sort(key=lambda x: format_priority(x["type"])) best_paper = p_list[0] - filename = best_paper["filename"] + raw_filename = (best_paper.get("filename") or "").strip() + filename = Path(raw_filename).name + if not filename or filename != raw_filename: + logger.warning( + "Skipping 
paper %s due to unsafe filename %r",
+                pid,
+                raw_filename,
+            )
+            continue
         local_path = raw_dir / filename
         url = best_paper["url"]
diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py
index cf846b0..4328711 100644
--- a/wg21_paper_tracker/services.py
+++ b/wg21_paper_tracker/services.py
@@ -2,13 +2,18 @@
 Database logic for WG21 Paper Tracker.
 """
 
-from typing import Optional
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
 
 from django.db import transaction
 
 from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile
 from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor
 
+if TYPE_CHECKING:
+    from cppa_user_tracker.models import WG21PaperAuthorProfile
+
 
 @transaction.atomic
 def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]:
@@ -30,10 +35,11 @@ def get_or_create_paper(
     mailing: WG21Mailing,
     subgroup: str = "",
     author_names: Optional[list[str]] = None,
+    author_emails: Optional[list[str]] = None,
     year: int | None = None,
 ) -> tuple[WG21Paper, bool]:
     paper_id = (paper_id or "").strip().lower()
-    year_val = None
+    year_val = 0
     if year:
         s = (year if isinstance(year, str) else str(year)).strip()[:4]
         if s.isdigit():
@@ -66,23 +72,53 @@
         if paper.subgroup != subgroup:
             paper.subgroup = subgroup
             updated = True
-        if year_val is not None and paper.year != year_val:
+        if paper.year != year_val:
             paper.year = year_val
             updated = True
         if updated:
             paper.save()
 
     if author_names:
-        for name in author_names:
-            profile, _ = get_or_create_wg21_paper_author_profile(name)
-            WG21PaperAuthor.objects.get_or_create(
-                paper=paper,
-                profile=profile,
-            )
+        emails = author_emails or []
+        for i, name in enumerate(author_names):
+            email = emails[i] if i < len(emails) else None
+            profile, _ = get_or_create_wg21_paper_author_profile(name, email=email)
+            get_or_create_paper_author(paper, profile, i + 1)
 
     return paper, created
 
 
-def mark_paper_downloaded(paper_id: str):
+def get_or_create_paper_author(
+    paper: WG21Paper,
+    profile: WG21PaperAuthorProfile,
+    author_order: int,
+) -> tuple[WG21PaperAuthor, bool]:
+    """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based).
+    Updates author_order on existing link if it differs.
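+
+    Example (illustrative; assumes existing ``paper`` and ``profile`` objects):
+
+        link, created = get_or_create_paper_author(paper, profile, 1)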
+ """ + link, link_created = WG21PaperAuthor.objects.get_or_create( + paper=paper, + profile=profile, + defaults={"author_order": author_order}, + ) + if not link_created and link.author_order != author_order: + link.author_order = author_order + link.save(update_fields=["author_order"]) + return link, link_created + + +def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() - WG21Paper.objects.filter(paper_id=paper_id).update(is_downloaded=True) + year_val = 0 + if year is not None: + s = (year if isinstance(year, str) else str(year)).strip()[:4] + if s.isdigit(): + year_val = int(s) + WG21Paper.objects.filter( + paper_id=paper_id, + year=year_val, + ).update(is_downloaded=True) diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py index 70a2338..8b2ffec 100644 --- a/wg21_paper_tracker/tests/test_fetcher.py +++ b/wg21_paper_tracker/tests/test_fetcher.py @@ -2,6 +2,7 @@ from unittest.mock import patch, MagicMock +import requests from wg21_paper_tracker.fetcher import ( BASE_URL, @@ -14,18 +15,18 @@ def test_fetch_all_mailings_returns_empty_on_request_failure(): - """fetch_all_mailings returns [] when requests.get raises.""" + """fetch_all_mailings returns [] when requests.get raises RequestException.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.side_effect = Exception("network error") + m.side_effect = requests.RequestException("network error") result = fetch_all_mailings() assert result == [] def test_fetch_all_mailings_returns_empty_on_http_error(): - """fetch_all_mailings returns [] when response.raise_for_status raises.""" + """fetch_all_mailings returns [] when response.raise_for_status raises HTTPError.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: resp = MagicMock() - resp.raise_for_status.side_effect = Exception("404") + resp.raise_for_status.side_effect = requests.HTTPError("404") m.return_value = resp result = fetch_all_mailings() assert result == [] @@ -68,9 +69,9 @@ def test_fetch_all_mailings_calls_index_url(): def test_fetch_papers_for_mailing_returns_empty_on_request_failure(): - """fetch_papers_for_mailing returns [] when requests.get raises.""" + """fetch_papers_for_mailing returns [] when requests.get raises RequestException.""" with patch("wg21_paper_tracker.fetcher.requests.get") as m: - m.side_effect = Exception("timeout") + m.side_effect = requests.RequestException("timeout") result = fetch_papers_for_mailing("2025", "2025-01") assert result == [] diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py index ad4df9c..592ceec 100644 --- a/wg21_paper_tracker/tests/test_pipeline.py +++ b/wg21_paper_tracker/tests/test_pipeline.py @@ -4,6 +4,7 @@ from unittest.mock import patch, MagicMock import pytest +import requests from wg21_paper_tracker.pipeline import ( DOWNLOAD_TIMEOUT, @@ -66,7 +67,7 @@ def test_download_file_retries_on_failure(tmp_path): url = "https://example.com/f" filepath = tmp_path / "f" with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = Exception("connection error") + m.side_effect = requests.RequestException("connection error") with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock: result = _download_file(url, filepath) assert result is False @@ -84,7 +85,7 @@ def test_download_file_succeeds_on_second_attempt(tmp_path): resp.content = b"ok" resp.apparent_encoding = "utf-8" with patch("wg21_paper_tracker.pipeline.requests.get") as m: - 
m.side_effect = [Exception("first fail"), resp] + m.side_effect = [requests.RequestException("first fail"), resp] with patch("wg21_paper_tracker.pipeline.time.sleep"): result = _download_file(url, filepath) assert result is True diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py index a0a9b6f..6ec6a00 100644 --- a/wg21_paper_tracker/tests/test_services.py +++ b/wg21_paper_tracker/tests/test_services.py @@ -72,32 +72,35 @@ def test_get_or_create_paper_creates_new(mock_profile, db): @pytest.mark.django_db @patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile") -def test_get_or_create_paper_calls_author_profile_for_each_author(mock_profile, db): - """get_or_create_paper calls get_or_create_wg21_paper_author_profile for each author name.""" +@patch("wg21_paper_tracker.services.get_or_create_paper_author") +def test_get_or_create_paper_calls_author_profile_for_each_author( + mock_get_or_create_paper_author, mock_profile, db +): + """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author.""" from unittest.mock import MagicMock profile = MagicMock() profile.pk = 1 mock_profile.return_value = (profile, True) + mock_get_or_create_paper_author.return_value = (MagicMock(), True) mailing, _ = get_or_create_mailing("2025-01", "Title") - with patch( - "wg21_paper_tracker.services.WG21PaperAuthor.objects.get_or_create" - ) as mock_link: - mock_link.return_value = (MagicMock(), True) - paper, created = get_or_create_paper( - paper_id="p1000r0", - url="https://example.com/p1000r0.pdf", - title="A paper", - document_date=None, - mailing=mailing, - author_names=["Alice", "Bob"], - year=2025, - ) + paper, created = get_or_create_paper( + paper_id="p1000r0", + url="https://example.com/p1000r0.pdf", + title="A paper", + document_date=None, + mailing=mailing, + author_names=["Alice", "Bob"], + year=2025, + ) assert created is True assert mock_profile.call_count == 2 - mock_profile.assert_any_call("Alice") - mock_profile.assert_any_call("Bob") + mock_profile.assert_any_call("Alice", email=None) + mock_profile.assert_any_call("Bob", email=None) + assert mock_get_or_create_paper_author.call_count == 2 + mock_get_or_create_paper_author.assert_any_call(paper, profile, 1) + mock_get_or_create_paper_author.assert_any_call(paper, profile, 2) @pytest.mark.django_db @@ -147,8 +150,8 @@ def test_get_or_create_paper_gets_existing_and_updates(db): @pytest.mark.django_db -def test_get_or_create_paper_year_none_stored_as_null(db): - """get_or_create_paper with year=None stores null.""" +def test_get_or_create_paper_year_none_stored_as_zero(db): + """get_or_create_paper with year=None stores 0 for unknown year.""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="n5034", @@ -158,7 +161,7 @@ def test_get_or_create_paper_year_none_stored_as_null(db): mailing=mailing, year=None, ) - assert paper.year is None + assert paper.year == 0 @pytest.mark.django_db @@ -187,12 +190,32 @@ def test_get_or_create_paper_same_paper_id_different_year_creates_two(db): assert p1.year == 2024 and p2.year == 2025 +@pytest.mark.django_db +def test_get_or_create_paper_sets_author_order(db): + """get_or_create_paper sets author_order (1-based) on WG21PaperAuthor links.""" + mailing, _ = get_or_create_mailing("2025-01", "Title") + paper, _ = get_or_create_paper( + paper_id="p9999", + url="https://example.com/p9999.pdf", + title="Multi-author paper", + document_date=None, + 
mailing=mailing,
+        author_names=["First Author", "Second Author", "Third Author"],
+        year=2025,
+    )
+    links = list(paper.authors.order_by("author_order"))
+    assert len(links) == 3
+    assert links[0].author_order == 1
+    assert links[1].author_order == 2
+    assert links[2].author_order == 3
+
+
 # --- mark_paper_downloaded ---
 
 
 @pytest.mark.django_db
 def test_mark_paper_downloaded_sets_flag(db):
-    """mark_paper_downloaded sets is_downloaded=True for matching paper_id."""
+    """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year)."""
     mailing, _ = get_or_create_mailing("2025-01", "Title")
     paper, _ = get_or_create_paper(
         paper_id="p1000r0",
@@ -203,14 +226,14 @@
         year=2025,
     )
     assert paper.is_downloaded is False
-    mark_paper_downloaded("p1000r0")
+    mark_paper_downloaded("p1000r0", year=2025)
     paper.refresh_from_db()
     assert paper.is_downloaded is True
 
 
 @pytest.mark.django_db
 def test_mark_paper_downloaded_normalizes_paper_id(db):
-    """mark_paper_downloaded matches case-insensitively (normalizes to lower)."""
+    """mark_paper_downloaded matches case-insensitively (normalizes to lower) and by year."""
     mailing, _ = get_or_create_mailing("2025-01", "Title")
     paper, _ = get_or_create_paper(
         paper_id="p1000r0",
@@ -220,6 +243,6 @@
         year=2025,
     )
-    mark_paper_downloaded(" P1000R0 ")
+    mark_paper_downloaded(" P1000R0 ", year=2025)
     paper.refresh_from_db()
     assert paper.is_downloaded is True
diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py
index 3689ae9..8c8365e 100644
--- a/wg21_paper_tracker/tests/test_workspace.py
+++ b/wg21_paper_tracker/tests/test_workspace.py
@@ -18,7 +18,8 @@ def _get_path(app_slug):
         return p
 
     with patch(
-        "wg21_paper_tracker.workspace.get_workspace_path", side_effect=_get_path
+        "wg21_paper_tracker.workspace.get_workspace_path",
+        side_effect=_get_path,
     ):
         yield tmp_path
 
@@ -75,3 +76,20 @@ def test_get_raw_dir_idempotent(mock_workspace_path):
         p1 = get_raw_dir("2025-01")
         p2 = get_raw_dir("2025-01")
         assert p1 == p2
+        assert p1.parent == p2.parent
+
+
+def test_get_raw_dir_rejects_invalid_mailing_date():
+    """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.)."""
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("../../tmp")
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("2025")
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("2025-1")
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("2025-13")
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("2025-00")
+    with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+        get_raw_dir("")
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py
index 19c0d1b..89b853b 100644
--- a/wg21_paper_tracker/workspace.py
+++ b/wg21_paper_tracker/workspace.py
@@ -3,12 +3,16 @@
 Temporary file storage during download before uploading to GCS.
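+
+Example (illustrative): get_raw_dir("2025-03") maps to <workspace>/raw/wg21_paper_tracker/2025-03.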
""" +import re from pathlib import Path from config.workspace import get_workspace_path _APP_SLUG = "wg21_paper_tracker" _RAW_APP_SLUG = f"raw/{_APP_SLUG}" +_MAILING_DATE_RE = re.compile(r"^\d{4}-(0[1-9]|1[0-2])$") def get_workspace_root() -> Path: @@ -17,6 +19,8 @@ def get_workspace_root() -> Path: def get_raw_dir(mailing_date: str) -> Path: """Return workspace/raw/wg21_paper_tracker//; creates if missing.""" + if not _MAILING_DATE_RE.fullmatch(mailing_date): + raise ValueError("mailing_date must be in YYYY-MM format") raw_root = get_workspace_path(_RAW_APP_SLUG) path = raw_root / mailing_date path.mkdir(parents=True, exist_ok=True) From 62d5d427aff6c64c66ce1572110c7291b809f2c0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 14:39:05 -0700 Subject: [PATCH 05/20] Fix: WG21 tracker (year, GCS guard, IntegrityError), author_alias, Pillow, test fixes #24 --- ...06_wg21paperauthorprofile_author_alias.py} | 2 +- cppa_user_tracker/models.py | 2 +- docs/Schema.md | 4 +- .../converters/openai_converter.py | 2 +- wg21_paper_tracker/cloud_run_job/main.py | 2 +- .../cloud_run_job/requirements.txt | 2 +- .../commands/import_wg21_metadata_from_csv.py | 7 +++- .../commands/run_wg21_paper_tracker.py | 11 ++++-- wg21_paper_tracker/pipeline.py | 38 +++++++++++++++---- wg21_paper_tracker/tests/test_models.py | 12 +++++- wg21_paper_tracker/tests/test_pipeline.py | 9 ++++- wg21_paper_tracker/tests/test_services.py | 17 ++++++--- 12 files changed, 79 insertions(+), 29 deletions(-) rename cppa_user_tracker/migrations/{0006_wg21paperauthorprofile_author_alas.py => 0006_wg21paperauthorprofile_author_alias.py} (93%) diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py similarity index 93% rename from cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py rename to cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py index 9c47bb5..674176a 100644 --- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alas.py +++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py @@ -12,7 +12,7 @@ class Migration(migrations.Migration): operations = [ migrations.AddField( model_name="wg21paperauthorprofile", - name="author_alas", + name="author_alias", field=models.CharField(blank=True, db_index=True, default="", max_length=255), preserve_default=True, ), diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py index 75a52c6..70dca3d 100644 --- a/cppa_user_tracker/models.py +++ b/cppa_user_tracker/models.py @@ -165,7 +165,7 @@ def save(self, *args, **kwargs): super().save(*args, **kwargs) display_name = models.CharField(max_length=255, db_index=True, blank=True) - author_alas = models.CharField(max_length=255, blank=True, db_index=True) + author_alias = models.CharField(max_length=255, blank=True, db_index=True) created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) diff --git a/docs/Schema.md b/docs/Schema.md index 6b22def..12e676d 100644 --- a/docs/Schema.md +++ b/docs/Schema.md @@ -68,7 +68,7 @@ erDiagram WG21PaperAuthorProfile { string display_name "IX" - string author_alas "IX" + string author_alias "IX" datetime created_at datetime updated_at } @@ -725,7 +725,7 @@ erDiagram | **GitHubAccount** | Profile for GitHub (user/org/enterprise); extends BaseProfile. | 1 | | **SlackUser** | Profile for Slack; extends BaseProfile. 
| 1 | | **MailingListProfile** | Profile for mailing list; extends BaseProfile. | 1 | -| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alas`. | 1 | +| **WG21PaperAuthorProfile** | Profile for WG21 paper authors; extends BaseProfile. Optional `author_alias`. | 1 | | **TmpIdentity** | Temporary identity for staging (CPPA User Tracker). | 1 | | **TempProfileIdentityRelation** | Staging table: base_profile_id -> target_identity_id (CPPA User Tracker). | 1 | | **GitHubRepository** | Repository metadata (owner, repo_name, stars, forks, etc.). Base table for repo subtypes. | 2 | diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 3a94230..078e984 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -242,7 +242,7 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: f"Error processing page {page_num}: {str(e)}", exc_info=True ) markdown_parts.append( - f"## Page {page_num}\n\n*[Error processing this page: {str(e)}]*\n\n" + f"## Page {page_num}\n\n*[Error processing this page]*\n\n" ) continue diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index cdfe40a..e1a0153 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -37,7 +37,7 @@ def is_content_valid(content: Optional[str]) -> bool: first_part = content_lower[:1000] for pattern in error_patterns: if pattern in first_part: - if pattern.startswith("error:") or pattern.startswith("exception:"): + if pattern in ("error:", "exception:"): return False idx = first_part.find(pattern) if idx < 100: diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt index 096efc5..82422b1 100644 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ b/wg21_paper_tracker/cloud_run_job/requirements.txt @@ -1,6 +1,6 @@ docling>=1.0.0 pdfplumber>=0.10.0 pdf2image>=1.16.0 -Pillow>=10.3.0 +Pillow>=12.1.1 requests>=2.31.0 google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 0d1b903..fc45d7f 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -202,10 +202,13 @@ def handle(self, *args, **options): else: stats["papers_updated"] += 1 except IntegrityError as e: - # Duplicate (paper_id) or (paper_id, year): fetch existing and update + # Duplicate (paper_id, year): fetch existing by same key and update stats["papers_updated"] += 1 try: - paper = WG21Paper.objects.filter(paper_id=paper_id).first() + lookup_year = year if year is not None else 0 + paper = WG21Paper.objects.filter( + paper_id=paper_id, year=lookup_year + ).first() if paper is None: stats["skipped"] += 1 logger.error("Error for paper_id=%s: %s", paper_id, e) diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index 3945269..bfbb838 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -51,7 +51,8 @@ def handle(self, *args, **options): With --dry-run, logs and exits without 
running the pipeline or triggering Cloud Run.
 
     Otherwise runs the pipeline, then triggers the configured Cloud Run job when
-    total_new_papers > 0 and GCP_PROJECT_ID and WG21_CLOUD_RUN_JOB_NAME are set.
+    total_new_papers > 0 and GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and
+    WG21_GCS_BUCKET are set (trigger is skipped if GCS upload is disabled).
     """
     dry_run = options.get("dry_run", False)
     if dry_run:
@@ -62,14 +63,15 @@ def handle(self, *args, **options):
 
         try:
             total_new_papers = run_tracker_pipeline()
-            logger.info("Downloaded and uploaded %d new papers.", total_new_papers)
+            logger.info("Processed %d new papers.", total_new_papers)
 
             if total_new_papers > 0:
                 project_id = getattr(settings, "GCP_PROJECT_ID", None)
                 location = getattr(settings, "GCP_LOCATION", "us-central1")
                 job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None)
+                bucket = getattr(settings, "WG21_GCS_BUCKET", None)
 
-                if project_id and job_name:
+                if project_id and job_name and bucket:
                     try:
                         trigger_cloud_run_job(project_id, location, job_name)
                         logger.info(
@@ -79,7 +81,8 @@
                         logger.error("Failed to trigger Cloud Run job: %s", e)
                 else:
                     logger.warning(
-                        "GCP_PROJECT_ID not configured. Skipping Cloud Run trigger."
+                        "Skipping Cloud Run trigger because GCP_PROJECT_ID, "
+                        "WG21_CLOUD_RUN_JOB_NAME, or WG21_GCS_BUCKET is not configured."
                     )
             else:
                 logger.info("No new papers found. Skipping Cloud Run job.")
diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py
index 16a9bb6..e0b4f87 100644
--- a/wg21_paper_tracker/pipeline.py
+++ b/wg21_paper_tracker/pipeline.py
@@ -157,13 +157,36 @@ def run_tracker_pipeline() -> int:
     for m_info in new_mailings:
         mailing_date = m_info["mailing_date"]
         title = m_info["title"]
-        year = int(m_info["year"]) if m_info["year"] else None
+        # Normalize year once; use 0 when missing/empty/unparseable so it can be corrected later
+        year_raw = m_info.get("year")
+        if not year_raw or not str(year_raw).strip():
+            year = 0
+            logger.warning(
+                "Mailing %s: year missing or empty, using 0 (fix later).",
+                mailing_date,
+            )
+        else:
+            try:
+                year = int(str(year_raw).strip()[:4])
+                if year <= 0:
+                    year = 0
+                    logger.warning(
+                        "Mailing %s: year invalid, using 0 (fix later).",
+                        mailing_date,
+                    )
+            except (ValueError, TypeError):
+                year = 0
+                logger.warning(
+                    "Mailing %s: year not parseable %r, using 0 (fix later).",
+                    mailing_date,
+                    year_raw,
+                )
 
         # Create/get mailing in DB
         mailing_obj, _ = get_or_create_mailing(mailing_date, title)
 
         # Fetch papers for this mailing
-        papers = fetch_papers_for_mailing(year, mailing_date)
+        papers = fetch_papers_for_mailing(str(year), mailing_date)
         if not papers:
             logger.info(
                 "Mailing %s: no papers found (anchor/table may be missing).",
@@ -180,18 +203,17 @@
         papers_by_id[pid].append(p)
 
     def format_priority(ext: str) -> int:
-        priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4}
+        priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4}
         return priorities.get(ext.lower(), 100)
 
     raw_dir = get_raw_dir(mailing_date)
     skipped_downloaded = 0
-    year_val = year if year is not None else 0
 
     for pid, p_list in papers_by_id.items():
         # Skip only if this (paper_id, year) is already downloaded
         if WG21Paper.objects.filter(
             paper_id=pid,
-            year=year_val,
+            year=year,
             is_downloaded=True,
         ).exists():
             skipped_downloaded += 1
@@ -220,8 +242,11 @@ def format_priority(ext: str) -> int:
         if bucket_name:
             gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}"
             uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path)
         else:
-            # If no GCS, simulate success so DB is updated
-            uploaded = True
+            logger.warning(
+                "WG21_GCS_BUCKET is not configured; leaving %s as not downloaded.",
+                pid,
+            )
+            uploaded = False
 
         # Persist DB
         doc_date_str = best_paper["document_date"]
diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py
index 5d9a1ac..9b4ee7e 100644
--- a/wg21_paper_tracker/tests/test_models.py
+++ b/wg21_paper_tracker/tests/test_models.py
@@ -3,6 +3,7 @@
 from datetime import date
 
 import pytest
+from django.db import IntegrityError, transaction
 
 from wg21_paper_tracker.models import WG21Mailing, WG21Paper
 
@@ -56,7 +57,7 @@
 @pytest.mark.django_db
 def test_wg21_paper_unique_together_paper_id_year():
-    """WG21Paper allows same paper_id with different year."""
+    """WG21Paper allows same paper_id with different year; rejects duplicate (paper_id, year)."""
     m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1")
     m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2")
     WG21Paper.objects.create(
@@ -66,6 +67,15 @@
         mailing=m1,
         year=2024,
     )
+    with pytest.raises(IntegrityError):
+        with transaction.atomic():
+            WG21Paper.objects.create(
+                paper_id="sd-1",
+                url="https://example.com/dup.pdf",
+                title="T1 dup",
+                mailing=m1,
+                year=2024,
+            )
     p2 = WG21Paper.objects.create(
         paper_id="sd-1",
         url="https://example.com/2.pdf",
diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py
index 592ceec..4756ffd 100644
--- a/wg21_paper_tracker/tests/test_pipeline.py
+++ b/wg21_paper_tracker/tests/test_pipeline.py
@@ -155,7 +155,12 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path):
                     "wg21_paper_tracker.pipeline._download_file", return_value=True
                 ):
                     with patch(
-                        "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", None
+                        "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET",
+                        "test-bucket",
                     ):
-                        n = run_tracker_pipeline()
+                        with patch(
+                            "wg21_paper_tracker.pipeline._upload_to_gcs",
+                            return_value=True,
+                        ):
+                            n = run_tracker_pipeline()
     assert n == 1
diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py
index 6ec6a00..023f15c 100644
--- a/wg21_paper_tracker/tests/test_services.py
+++ b/wg21_paper_tracker/tests/test_services.py
@@ -79,9 +79,14 @@
     """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author."""
     from unittest.mock import MagicMock
 
-    profile = MagicMock()
-    profile.pk = 1
-    mock_profile.return_value = (profile, True)
+    alice_profile = MagicMock()
+    alice_profile.pk = 1
+    bob_profile = MagicMock()
+    bob_profile.pk = 2
+    mock_profile.side_effect = [
+        (alice_profile, True),
+        (bob_profile, True),
+    ]
     mock_get_or_create_paper_author.return_value = (MagicMock(), True)
 
     mailing, _ = get_or_create_mailing("2025-01", "Title")
@@ -99,8 +104,8 @@
     mock_profile.assert_any_call("Alice", email=None)
     mock_profile.assert_any_call("Bob", email=None)
     assert mock_get_or_create_paper_author.call_count == 2
-    mock_get_or_create_paper_author.assert_any_call(paper, profile, 1)
-    mock_get_or_create_paper_author.assert_any_call(paper, profile, 2)
+    mock_get_or_create_paper_author.assert_any_call(paper, alice_profile, 1)
+    mock_get_or_create_paper_author.assert_any_call(paper, bob_profile, 2)
 
 
 @pytest.mark.django_db
 def
test_get_or_create_paper_gets_existing_and_updates(db): @pytest.mark.django_db def test_get_or_create_paper_year_none_stored_as_zero(db): - """get_or_create_paper with year=None stores 0 for unknown year.""" + """get_or_create_paper with year=None stores 0 so records can be updated later.""" mailing, _ = get_or_create_mailing("2025-01", "Title") paper, _ = get_or_create_paper( paper_id="n5034", From e3e91c85b9d550e0e548f842628bafcfd27a6b70 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 16:52:31 -0700 Subject: [PATCH 06/20] =?UTF-8?q?Fix:=20WG21=20=E2=80=93=20optional=20Clou?= =?UTF-8?q?d=20Run,=20per-blob=20isolation,=20PDF=20priority,=20year=3D0?= =?UTF-8?q?=20promotion,=20logging=20#24?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/settings.py | 5 +- docs/operations/WG21_Cloud_Run.md | 3 +- wg21_paper_tracker/cloud_run_job/main.py | 56 +++++++------ wg21_paper_tracker/fetcher.py | 4 +- .../commands/run_wg21_paper_tracker.py | 19 +++-- wg21_paper_tracker/pipeline.py | 8 +- wg21_paper_tracker/services.py | 82 ++++++++++++++----- 7 files changed, 112 insertions(+), 65 deletions(-) diff --git a/config/settings.py b/config/settings.py index 925ebe5..17f549a 100644 --- a/config/settings.py +++ b/config/settings.py @@ -220,9 +220,8 @@ WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = ( - env("WG21_CLOUD_RUN_JOB_NAME", default="wg21-convert") or "" -).strip() +WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="") or "").strip() +WG21_CLOUD_RUN_ENABLED = env.bool("WG21_CLOUD_RUN_ENABLED", default=False) # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md index 7840bd1..e3b1338 100644 --- a/docs/operations/WG21_Cloud_Run.md +++ b/docs/operations/WG21_Cloud_Run.md @@ -11,7 +11,8 @@ Create a GCS bucket (e.g., `wg21-data-collector`). Ensure your Django app has the following environment variables configured: - `WG21_GCS_BUCKET`: The name of the GCS bucket. - `GCP_PROJECT_ID`: Your GCP project ID. -- `WG21_CLOUD_RUN_JOB_NAME`: (Optional, defaults to `wg21-convert`) The name of the deployed Cloud Run job. +- `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). No default; leave unset if you only use GCS uploads without triggering the job. +- `WG21_CLOUD_RUN_ENABLED`: (Optional, default `false`) Set to `true` to allow the tracker to trigger the Cloud Run conversion job when new papers are uploaded. Keeps the trigger optional even when project and bucket are set. - `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job. ## 2. 
Build and Push the Docker Image diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py index e1a0153..e2f9781 100644 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ b/wg21_paper_tracker/cloud_run_job/main.py @@ -11,7 +11,8 @@ from converters.openai_converter import convert_with_openai logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) logger = logging.getLogger(__name__) @@ -73,7 +74,7 @@ def main(): client = storage.Client() bucket = client.bucket(bucket_name) - raw_prefix = "raw/wg21_papers/" + raw_prefix = "raw/wg21_paper_tracker/" converted_prefix = "converted/wg21_papers/" blobs = client.list_blobs(bucket, prefix=raw_prefix) @@ -83,31 +84,34 @@ def main(): if not blob.name.lower().endswith(".pdf"): continue - # e.g. raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix) :] - md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" - md_blob_name = f"{converted_prefix}{md_relative_path}" - - md_blob = bucket.blob(md_blob_name) - if md_blob.exists(): - logger.info("Skipping %s, MD already exists.", blob.name) - continue - local_pdf_path = Path(tmpdir) / "temp.pdf" - logger.info("Downloading %s to process...", blob.name) - blob.download_to_filename(str(local_pdf_path)) - - logger.info("Converting %s...", blob.name) - md_content = convert_pdf_to_md(local_pdf_path) - - if md_content: - md_blob.upload_from_string(md_content, content_type="text/markdown") - logger.info("Successfully converted and uploaded %s", md_blob_name) - else: - logger.error("Failed to convert %s", blob.name) - - if local_pdf_path.exists(): - local_pdf_path.unlink() + try: + # e.g. 
raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf + relative_path = blob.name[len(raw_prefix) :] + md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" + md_blob_name = f"{converted_prefix}{md_relative_path}" + + md_blob = bucket.blob(md_blob_name) + if md_blob.exists(): + logger.info("Skipping %s, MD already exists.", blob.name) + continue + + logger.info("Downloading %s to process...", blob.name) + blob.download_to_filename(str(local_pdf_path)) + + logger.info("Converting %s...", blob.name) + md_content = convert_pdf_to_md(local_pdf_path) + + if md_content: + md_blob.upload_from_string(md_content, content_type="text/markdown") + logger.info("Successfully converted and uploaded %s", md_blob_name) + else: + logger.error("Failed to convert %s", blob.name) + except Exception: + logger.exception("Failed processing %s", blob.name) + finally: + if local_pdf_path.exists(): + local_pdf_path.unlink() if __name__ == "__main__": diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index 4e44bd0..a5d0cc4 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -30,7 +30,7 @@ def fetch_all_mailings() -> list[dict]: response = requests.get(f"{BASE_URL}/", timeout=30) response.raise_for_status() except requests.RequestException: - logger.error("Failed to fetch WG21 index.") + logger.exception("Failed to fetch WG21 index.") return [] # The mailings are listed in a markdown-like syntax or links @@ -66,7 +66,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: response = requests.get(url, timeout=30) response.raise_for_status() except requests.RequestException: - logger.error("Failed to fetch year page %s.", year) + logger.exception("Failed to fetch year page %s.", year) return [] soup = BeautifulSoup(response.text, "html.parser") diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index bfbb838..b1885af 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -51,8 +51,8 @@ def handle(self, *args, **options): With --dry-run, logs and exits without running the pipeline or triggering Cloud Run. Otherwise runs the pipeline, then triggers the configured Cloud Run job when - total_new_papers > 0 and GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and - WG21_GCS_BUCKET are set (trigger is skipped if GCS upload is disabled). + total_new_papers > 0, WG21_CLOUD_RUN_ENABLED is True, and + GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and WG21_GCS_BUCKET are set. 
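+
+        Example (typical invocation; --dry-run only logs and exits):
+            python manage.py run_wg21_paper_tracker --dry-run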
""" dry_run = options.get("dry_run", False) if dry_run: @@ -70,19 +70,24 @@ def handle(self, *args, **options): location = getattr(settings, "GCP_LOCATION", "us-central1") job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) bucket = getattr(settings, "WG21_GCS_BUCKET", None) + cloud_run_enabled = getattr(settings, "WG21_CLOUD_RUN_ENABLED", False) - if project_id and job_name and bucket: + if project_id and job_name and bucket and cloud_run_enabled: try: trigger_cloud_run_job(project_id, location, job_name) logger.info( "Successfully triggered Cloud Run job %s.", job_name ) - except Exception as e: - logger.error("Failed to trigger Cloud Run job: %s", e) + except Exception: + logger.exception( + "Failed to trigger Cloud Run job %s.", job_name + ) + raise else: logger.warning( - "Skipping Cloud Run trigger because GCP_PROJECT_ID, " - "WG21_CLOUD_RUN_JOB_NAME, or WG21_GCS_BUCKET is not configured." + "Skipping Cloud Run trigger: set WG21_CLOUD_RUN_ENABLED=True " + "and configure GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and " + "WG21_GCS_BUCKET to enable." ) else: logger.info("No new papers found. Skipping Cloud Run job.") diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index e0b4f87..8baa910 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -203,7 +203,8 @@ def run_tracker_pipeline() -> int: papers_by_id[pid].append(p) def format_priority(ext: str) -> int: - priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} + # Prefer PDF (Cloud Run converts PDFs); then html, adoc, ps + priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4} return priorities.get(ext.lower(), 100) raw_dir = get_raw_dir(mailing_date) @@ -219,9 +220,8 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue - # Pick the best format - p_list.sort(key=lambda x: format_priority(x["type"])) - best_paper = p_list[0] + # Pick the best format (PDF first for conversion) + best_paper = min(p_list, key=lambda x: format_priority(x["type"])) raw_filename = (best_paper.get("filename") or "").strip() filename = Path(raw_filename).name diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 4328711..f773b75 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -15,6 +15,16 @@ from cppa_user_tracker.models import WG21PaperAuthorProfile +def _normalize_year(year: int | str | None) -> int: + """Return a 4-digit year as int, or 0 if missing/invalid.""" + if year is None: + return 0 + if isinstance(year, int): + return year if 0 < year <= 9999 else 0 + s = str(year).strip()[:4] + return int(s) if s.isdigit() else 0 + + @transaction.atomic def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]: mailing, created = WG21Mailing.objects.get_or_create( @@ -39,23 +49,9 @@ def get_or_create_paper( year: int | None = None, ) -> tuple[WG21Paper, bool]: paper_id = (paper_id or "").strip().lower() - year_val = 0 - if year: - s = (year if isinstance(year, str) else str(year)).strip()[:4] - if s.isdigit(): - year_val = int(s) - paper, created = WG21Paper.objects.get_or_create( - paper_id=paper_id, - year=year_val, - defaults={ - "url": url, - "title": title, - "document_date": document_date, - "mailing": mailing, - "subgroup": subgroup, - }, - ) - if not created: + year_val = _normalize_year(year) + + def _update_paper(paper: WG21Paper) -> bool: updated = False if paper.url != url: paper.url = url @@ -77,6 +73,52 @@ def get_or_create_paper( updated = True if updated: paper.save() + 
return updated
+
+    if year_val > 0:
+        # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year
+        paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first()
+        if paper:
+            _update_paper(paper)
+            created = False
+        else:
+            placeholder = WG21Paper.objects.filter(paper_id=paper_id, year=0).first()
+            if placeholder:
+                placeholder.url = url
+                placeholder.title = title
+                placeholder.document_date = document_date
+                placeholder.mailing = mailing
+                placeholder.subgroup = subgroup
+                placeholder.year = year_val
+                placeholder.save()
+                paper = placeholder
+                created = False
+            else:
+                paper, created = WG21Paper.objects.get_or_create(
+                    paper_id=paper_id,
+                    year=year_val,
+                    defaults={
+                        "url": url,
+                        "title": title,
+                        "document_date": document_date,
+                        "mailing": mailing,
+                        "subgroup": subgroup,
+                    },
+                )
+    else:
+        paper, created = WG21Paper.objects.get_or_create(
+            paper_id=paper_id,
+            year=0,
+            defaults={
+                "url": url,
+                "title": title,
+                "document_date": document_date,
+                "mailing": mailing,
+                "subgroup": subgroup,
+            },
+        )
+        if not created:
+            _update_paper(paper)
 
     if author_names:
         emails = author_emails or []
@@ -109,11 +151,7 @@ def get_or_create_paper_author(
 
 def mark_paper_downloaded(paper_id: str, year: int | None = None):
     paper_id = (paper_id or "").strip().lower()
-    year_val = 0
-    if year is not None:
-        s = (year if isinstance(year, str) else str(year)).strip()[:4]
-        if s.isdigit():
-            year_val = int(s)
+    year_val = _normalize_year(year)
     WG21Paper.objects.filter(
         paper_id=paper_id,
         year=year_val,

From 2159f5348874fc423fec61aaf67d14d25b2475be Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 10 Mar 2026 17:43:13 -0700
Subject: [PATCH 07/20] wg21: fix author_alias migration default, fail job when
 bucket unset, use raw/wg21_paper_tracker/YYYY/<mailing>/ #24

---
 ...006_wg21paperauthorprofile_author_alias.py |  2 +-
 docs/operations/WG21_Cloud_Run.md             |  3 ++-
 wg21_paper_tracker/cloud_run_job/main.py      |  4 +--
 wg21_paper_tracker/pipeline.py                |  9 ++++---
 wg21_paper_tracker/tests/test_workspace.py    | 25 ++++++++++---------
 wg21_paper_tracker/workspace.py               |  9 ++++---
 6 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py
index 674176a..1660763 100644
--- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py
+++ b/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py
@@ -14,6 +14,6 @@ class Migration(migrations.Migration):
             model_name="wg21paperauthorprofile",
             name="author_alias",
             field=models.CharField(blank=True, db_index=True, default="", max_length=255),
-            preserve_default=True,
+            preserve_default=False,
         ),
     ]
diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md
index e3b1338..cabd828 100644
--- a/docs/operations/WG21_Cloud_Run.md
+++ b/docs/operations/WG21_Cloud_Run.md
@@ -9,6 +9,7 @@ The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job vi
 Create a GCS bucket (e.g., `wg21-data-collector`).
 
 Ensure your Django app has the following environment variables configured:
+
 - `WG21_GCS_BUCKET`: The name of the GCS bucket.
 - `GCP_PROJECT_ID`: Your GCP project ID.
 - `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). No default; leave unset if you only use GCS uploads without triggering the job.
@@ -59,6 +60,6 @@ Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manage
 1. 
**Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs.
 2. It checks the WG21 site for new mailings.
-3. If found, it downloads PDFs and uploads them directly to `gs://<bucket>/raw/wg21_papers/<mailing_date>/`.
+3. If found, it downloads PDFs and uploads them directly to `gs://<bucket>/raw/wg21_paper_tracker/<year>/<mailing_date>/`.
 4. It calls the Cloud Run API to execute `wg21-convert`.
 5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs://<bucket>/converted/wg21_papers/<mailing_date>/`.
diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py
index e2f9781..61c57dc 100644
--- a/wg21_paper_tracker/cloud_run_job/main.py
+++ b/wg21_paper_tracker/cloud_run_job/main.py
@@ -69,7 +69,7 @@ def main():
     bucket_name = os.getenv("WG21_GCS_BUCKET")
     if not bucket_name:
         logger.error("WG21_GCS_BUCKET env var not set.")
-        return
+        raise RuntimeError("WG21_GCS_BUCKET env var not set.")
 
     client = storage.Client()
     bucket = client.bucket(bucket_name)
@@ -86,7 +86,7 @@ def main():
         local_pdf_path = Path(tmpdir) / "temp.pdf"
 
         try:
-            # e.g. raw/wg21_papers/2025-02/p0149r1.pdf -> 2025-02/p0149r1.pdf
+            # e.g. raw/wg21_paper_tracker/2025/2025-02/p0149r1.pdf -> 2025/2025-02/p0149r1.pdf
             relative_path = blob.name[len(raw_prefix) :]
             md_relative_path = relative_path.rsplit(".", 1)[0] + ".md"
             md_blob_name = f"{converted_prefix}{md_relative_path}"
diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py
index 8baa910..639d400 100644
--- a/wg21_paper_tracker/pipeline.py
+++ b/wg21_paper_tracker/pipeline.py
@@ -203,11 +203,10 @@ def run_tracker_pipeline() -> int:
             papers_by_id[pid].append(p)
 
         def format_priority(ext: str) -> int:
-            # Prefer PDF (Cloud Run converts PDFs); then html, adoc, ps
-            priorities = {"pdf": 1, "html": 2, "adoc": 3, "ps": 4}
+            priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4}
             return priorities.get(ext.lower(), 100)
 
-        raw_dir = get_raw_dir(mailing_date)
+        raw_dir = get_raw_dir(mailing_date, year)
         skipped_downloaded = 0
 
         for pid, p_list in papers_by_id.items():
@@ -239,7 +238,9 @@ def format_priority(ext: str) -> int:
             if _download_file(url, local_path):
                 uploaded = False
                 if bucket_name:
-                    gcs_path = f"raw/wg21_papers/{mailing_date}/{filename}"
+                    gcs_path = (
+                        f"raw/wg21_paper_tracker/{year}/{mailing_date}/{filename}"
+                    )
                     uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path)
                 else:
                     logger.warning(
diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py
index 8c8365e..4e50899 100644
--- a/wg21_paper_tracker/tests/test_workspace.py
+++ b/wg21_paper_tracker/tests/test_workspace.py
@@ -41,7 +41,7 @@ def test_get_workspace_root_calls_get_workspace_path_with_slug():
 
 
 def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path):
-    """get_raw_dir returns raw/wg21_paper_tracker/<mailing_date>/."""
+    """get_raw_dir returns raw/wg21_paper_tracker/<year>/<mailing_date>/."""
     with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
         raw_root = mock_workspace_path / "raw_wg21_paper_tracker"
         raw_root.mkdir(parents=True, exist_ok=True)
         m.side_effect = lambda slug: {
             "wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker",
             "raw/wg21_paper_tracker": raw_root,
         }[slug]
-        path = get_raw_dir("2025-01")
-        assert path == raw_root / "2025-01"
+        path = get_raw_dir("2025-01", 2025)
+        assert path == raw_root / "2025" / "2025-01"
         assert path.is_dir()
 
 
 def test_get_raw_dir_creates_parents(mock_workspace_path):
     """get_raw_dir creates parent directories."""
     with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
         raw_root = mock_workspace_path / "raw_app"
         raw_root.mkdir(parents=True, exist_ok=True)
         m.side_effect = lambda slug: (
             raw_root if "raw" in 
slug else (mock_workspace_path / "app")
         )
-        path = get_raw_dir("2026-02")
+        path = get_raw_dir("2026-02", 2026)
         assert path.exists()
+        assert path.parent.name == "2026"
         assert path.name == "2026-02"
 
 
@@ -73,8 +74,8 @@ def test_get_raw_dir_idempotent(mock_workspace_path):
     """get_raw_dir can be called twice for same mailing_date without error."""
     with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
         raw_root = mock_workspace_path / "raw"
         raw_root.mkdir(parents=True, exist_ok=True)
         m.side_effect = lambda slug: raw_root
-        p1 = get_raw_dir("2025-01")
-        p2 = get_raw_dir("2025-01")
+        p1 = get_raw_dir("2025-01", 2025)
+        p2 = get_raw_dir("2025-01", 2025)
         assert p1 == p2
         assert p1.parent == p2.parent
 
@@ -82,14 +83,14 @@ def test_get_raw_dir_idempotent(mock_workspace_path):
 def test_get_raw_dir_rejects_invalid_mailing_date():
     """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.)."""
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("../../tmp")
+        get_raw_dir("../../tmp", 2025)
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("2025")
+        get_raw_dir("2025", 2025)
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("2025-1")
+        get_raw_dir("2025-1", 2025)
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("2025-13")
+        get_raw_dir("2025-13", 2025)
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("2025-00")
+        get_raw_dir("2025-00", 2025)
     with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
-        get_raw_dir("")
+        get_raw_dir("", 2025)
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py
index 89b853b..04542fe 100644
--- a/wg21_paper_tracker/workspace.py
+++ b/wg21_paper_tracker/workspace.py
@@ -17,11 +17,14 @@ def get_workspace_root() -> Path:
     return get_workspace_path(_APP_SLUG)
 
 
-def get_raw_dir(mailing_date: str) -> Path:
-    """Return workspace/raw/wg21_paper_tracker/<mailing_date>/; creates if missing."""
+def get_raw_dir(mailing_date: str | None, year: int) -> Path:
+    """Return workspace/raw/wg21_paper_tracker/<year>/<mailing_date>/; creates if missing."""
     if not _MAILING_DATE_RE.fullmatch(mailing_date):
         raise ValueError("mailing_date must be in YYYY-MM format")
     raw_root = get_workspace_path(_RAW_APP_SLUG)
-    path = raw_root / mailing_date
+    if mailing_date:
+        path = raw_root / str(year) / mailing_date
+    else:
+        path = raw_root / str(year)
     path.mkdir(parents=True, exist_ok=True)
     return path

From be392a0403d38db9c31d0e8ac635a9ca398b0d9b Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 10 Mar 2026 18:11:01 -0700
Subject: [PATCH 08/20] wg21: honor settings.RAW_DIR for raw paper storage #24

---
 dev-24error:                                  |  0
 wg21_paper_tracker/fetcher.py                 | 49 +++++++++++++++----
 .../commands/import_wg21_metadata_from_csv.py |  5 +-
 wg21_paper_tracker/tests/test_commands.py     | 19 +++++++
 wg21_paper_tracker/tests/test_fetcher.py      | 24 +++++++++
 wg21_paper_tracker/tests/test_workspace.py    | 28 ++++------
 wg21_paper_tracker/workspace.py               |  8 ++-
 7 files changed, 101 insertions(+), 32 deletions(-)
 create mode 100644 dev-24error:
 create mode 100644 wg21_paper_tracker/tests/test_commands.py

diff --git a/dev-24error: b/dev-24error:
new file mode 100644
index 0000000..e69de29
diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py
index a5d0cc4..2c6ad03 100644
--- a/wg21_paper_tracker/fetcher.py
+++ b/wg21_paper_tracker/fetcher.py
@@ -5,9 +5,11 @@
 
 import re
 import urllib.parse
+from typing import Optional
 
 import requests
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import logging
 
@@ -15,6 +17,34 @@
 
 BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers"
 
+_MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$")
+
+
+def _find_table_in_section(anchor) -> Optional[Tag]:
+    """
+    Find the first <table> that belongs to the current mailing section.
+
+    Stops at the next mailing anchor (id/name matching mailingYYYY-MM) so we
+    do not attribute another mailing's table to this section.
+    """
+    if not anchor:
+        return None
+    anchor_id = anchor.get("id") or anchor.get("name") or ""
+    if not _MAILING_ANCHOR_RE.match(anchor_id):
+        return None
+    for elem in anchor.next_elements:
+        if not hasattr(elem, "name"):  # NavigableString, etc.
+            continue
+        if elem is anchor:
+            continue
+        if elem.name == "table":
+            return elem
+        if not hasattr(elem, "get"):  # e.g. NavigableString
+            continue
+        next_id = elem.get("id") or elem.get("name") or ""
+        if next_id and _MAILING_ANCHOR_RE.match(next_id) and next_id != anchor_id:
+            return None  # next section start; no table in this section
+    return None
+
 
 def fetch_all_mailings() -> list[dict]:
     """
@@ -76,7 +106,7 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]:
         logger.warning("Anchor %s not found on %s", anchor_id, url)
         return []
 
-    table = anchor.find_next("table")
+    table = _find_table_in_section(anchor)
     if not table:
         logger.warning("No table found after anchor %s", anchor_id)
         return []
@@ -96,14 +126,15 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]:
             href = link.get("href", "")
             match = paper_pattern.search(href)
             if match:
-                if href.startswith("../"):
-                    paper_url = urllib.parse.urljoin(url, href)
-                elif href.startswith("/"):
-                    paper_url = urllib.parse.urljoin(BASE_URL, href)
-                elif not href.startswith("http"):
-                    paper_url = urllib.parse.urljoin(url, href)
-                else:
-                    paper_url = href
+                paper_url = urllib.parse.urljoin(url, href)
+                parsed = urllib.parse.urlparse(paper_url)
+                base = urllib.parse.urlparse(BASE_URL)
+                if (
+                    parsed.scheme not in ("https", "http")
+                    or parsed.netloc != base.netloc
+                ):
+                    logger.warning("Skipping off-origin paper URL %s", paper_url)
+                    continue
 
                 paper_id = match.group(1).lower()
                 file_ext = match.group(2).lower()
diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py
index fc45d7f..3d63734 100644
--- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py
+++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py
@@ -12,7 +12,7 @@
 import re
 from pathlib import Path
 
-from django.core.management.base import BaseCommand
+from django.core.management.base import BaseCommand, CommandError
 from django.db import IntegrityError
 from django.utils.dateparse import parse_date
 
@@ -118,8 +118,7 @@ def handle(self, *args, **options):
         dry_run = options["dry_run"]
 
         if not csv_path.exists():
-            logger.error("File not found: %s", csv_path)
-            return
+            raise CommandError(f"File not found: {csv_path}")
 
         if dry_run:
             logger.info("Dry run: no DB writes.")
diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py
new file mode 100644
index 0000000..f9c9d7d
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_commands.py
@@ -0,0 +1,19 @@
+"""Tests for wg21_paper_tracker management commands."""
+
+import pytest
+from pathlib import Path
+
+from django.core.management import call_command
+from django.core.management.base import CommandError
+
+
+CMD_NAME = "import_wg21_metadata_from_csv"
+
+
+def 
test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path):
+    """Command raises CommandError when CSV file does not exist."""
+    csv_path = tmp_path / "nonexistent.csv"
+    assert not csv_path.exists()
+
+    with pytest.raises(CommandError, match=r"File not found:"):
+        call_command(CMD_NAME, f"--csv-file={csv_path}")
diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py
index 8b2ffec..3b903fb 100644
--- a/wg21_paper_tracker/tests/test_fetcher.py
+++ b/wg21_paper_tracker/tests/test_fetcher.py
@@ -171,6 +171,30 @@ def test_fetch_papers_for_mailing_returns_empty_when_no_table():
     assert result == []
 
 
+def test_fetch_papers_for_mailing_does_not_use_next_mailings_table():
+    """First mailing with no table returns []; second mailing's table is not used."""
+    html = """
+    <html><body>
+    <h2><a id="mailing2025-02" name="mailing2025-02"></a>2025-02</h2>
+    <p>No papers this month.</p>
+    <h2><a id="mailing2025-01" name="mailing2025-01"></a>2025-01</h2>
+    <table>
+    <tr>
+    <td><a href="p1234r1.pdf">p1234r1.pdf</a></td>
+    <td>Paper</td>
+    <td>A. Author</td>
+    <td>2025-01-10</td>
+    <td>SG1</td>
+    </tr>
+    </table>
+    </body></html>
+
+    """
+    with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+        resp = MagicMock()
+        resp.text = html
+        resp.raise_for_status = MagicMock()
+        m.return_value = resp
+        first = fetch_papers_for_mailing("2025", "2025-02")
+        second = fetch_papers_for_mailing("2025", "2025-01")
+    assert first == [], "2025-02 has no table; must not attribute 2025-01's table"
+    assert len(second) == 1
+    assert second[0]["paper_id"] == "p1234r1"
+
+
 def test_fetch_papers_for_mailing_calls_year_url():
     """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout."""
     with patch("wg21_paper_tracker.fetcher.requests.get") as m:
diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py
index 4e50899..09986df 100644
--- a/wg21_paper_tracker/tests/test_workspace.py
+++ b/wg21_paper_tracker/tests/test_workspace.py
@@ -41,27 +41,19 @@ def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path):
-    """get_raw_dir returns raw/wg21_paper_tracker/<year>/<mailing_date>/."""
-    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
-        raw_root = mock_workspace_path / "raw_wg21_paper_tracker"
-        raw_root.mkdir(parents=True, exist_ok=True)
-        m.side_effect = lambda slug: {
-            "wg21_paper_tracker": mock_workspace_path / "wg21_paper_tracker",
-            "raw/wg21_paper_tracker": raw_root,
-        }[slug]
+    """get_raw_dir returns RAW_DIR/wg21_paper_tracker/<year>/<mailing_date>/."""
+    with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+        mock_settings.RAW_DIR = mock_workspace_path
         path = get_raw_dir("2025-01", 2025)
-        assert path == raw_root / "2025" / "2025-01"
+        expected = mock_workspace_path / "wg21_paper_tracker" / "2025" / "2025-01"
+        assert path == expected
         assert path.is_dir()
 
 
 def test_get_raw_dir_creates_parents(mock_workspace_path):
     """get_raw_dir creates parent directories."""
-    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
-        raw_root = mock_workspace_path / "raw_app"
-        raw_root.mkdir(parents=True, exist_ok=True)
-        m.side_effect = lambda slug: (
-            raw_root if "raw" in slug else (mock_workspace_path / "app")
-        )
+    with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+        mock_settings.RAW_DIR = mock_workspace_path
         path = get_raw_dir("2026-02", 2026)
         assert path.exists()
         assert path.parent.name == "2026"
@@ -70,10 +62,8 @@ def test_get_raw_dir_creates_parents(mock_workspace_path):
 
 def test_get_raw_dir_idempotent(mock_workspace_path):
     """get_raw_dir can be called twice for same mailing_date without error."""
-    with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
-        raw_root = mock_workspace_path / "raw"
-        raw_root.mkdir(parents=True, exist_ok=True)
-        m.side_effect = lambda slug: raw_root
+    with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+        mock_settings.RAW_DIR = mock_workspace_path
         p1 = get_raw_dir("2025-01", 2025)
         p2 = get_raw_dir("2025-01", 2025)
         assert p1 == p2
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py
index 04542fe..1934ed8 100644
--- a/wg21_paper_tracker/workspace.py
+++ b/wg21_paper_tracker/workspace.py
@@ -6,6 +6,8 @@ import re
 from pathlib import Path
 
+from django.conf import settings
+
 from config.workspace import get_workspace_path
 
 _APP_SLUG = "wg21_paper_tracker"
@@ -21,7 +23,11 @@ def get_raw_dir(mailing_date: str | None, year: int) -> Path:
     """Return workspace/raw/wg21_paper_tracker/<year>/<mailing_date>/; creates if missing."""
     if not _MAILING_DATE_RE.fullmatch(mailing_date):
         raise ValueError("mailing_date must be in YYYY-MM 
format") - raw_root = get_workspace_path(_RAW_APP_SLUG) + if getattr(settings, "RAW_DIR", None): + raw_root = Path(settings.RAW_DIR) / _APP_SLUG + else: + raw_root = get_workspace_path(_RAW_APP_SLUG) + raw_root.mkdir(parents=True, exist_ok=True) if mailing_date: path = raw_root / str(year) / mailing_date else: From 005278a156cc35e33e9e7da1f3f2a7de3050b740 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 18:12:27 -0700 Subject: [PATCH 09/20] Fix: lint/format error #24 --- wg21_paper_tracker/tests/test_commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py index f9c9d7d..34a52e9 100644 --- a/wg21_paper_tracker/tests/test_commands.py +++ b/wg21_paper_tracker/tests/test_commands.py @@ -1,7 +1,6 @@ """Tests for wg21_paper_tracker management commands.""" import pytest -from pathlib import Path from django.core.management import call_command from django.core.management.base import CommandError From 35476526f60d32bb3cbf5b6d3a162b9edaaddc17 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 18:57:32 -0700 Subject: [PATCH 10/20] fix(openai_converter): use neutral page placeholder for failed pages #24 --- cppa_user_tracker/services.py | 18 ++++++++-- .../converters/openai_converter.py | 7 ++-- .../converters/pdfplumber_converter.py | 2 +- .../commands/import_wg21_metadata_from_csv.py | 2 +- wg21_paper_tracker/pipeline.py | 34 ++++++++++--------- wg21_paper_tracker/tests/test_pipeline.py | 5 ++- wg21_paper_tracker/workspace.py | 2 +- 7 files changed, 43 insertions(+), 27 deletions(-) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 146f778..152bbc2 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -381,11 +381,25 @@ def get_or_create_wg21_paper_author_profile( return profile, True if len(candidates) == 1: - return candidates[0], False + profile = candidates[0] + if email_val and not profile.emails.filter(email=email_val).exists(): + add_email( + profile, + email_val, + is_primary=not profile.emails.filter(is_active=True).exists(), + ) + return profile, False # Two or more: disambiguate by email if provided if email_val: for p in candidates: if p.emails.filter(email=email_val).exists(): return p, False - return candidates[0], False + profile = candidates[0] + if email_val and not profile.emails.filter(email=email_val).exists(): + add_email( + profile, + email_val, + is_primary=not profile.emails.filter(is_active=True).exists(), + ) + return profile, False diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 078e984..ae17f6e 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -234,15 +234,16 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: else: logger.warning(f"Failed to convert page {page_num} with OpenAI") markdown_parts.append( - f"## Page {page_num}\n\n*[Conversion failed for this page]*\n\n" + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) except Exception as e: logger.error( - f"Error processing page {page_num}: {str(e)}", exc_info=True + f"Error processing page {page_num}: {str(e)}", + exc_info=True, ) markdown_parts.append( - f"## Page {page_num}\n\n*[Error processing this page]*\n\n" + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) continue diff --git 
a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py index 6329c5a..fb36c4e 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py @@ -62,7 +62,7 @@ def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: markdown_parts.append( "| " + " | ".join( - str(cell) if cell else "" + "" if cell is None else str(cell) for cell in row ) + " |\n" diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 3d63734..365a008 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -202,7 +202,6 @@ def handle(self, *args, **options): stats["papers_updated"] += 1 except IntegrityError as e: # Duplicate (paper_id, year): fetch existing by same key and update - stats["papers_updated"] += 1 try: lookup_year = year if year is not None else 0 paper = WG21Paper.objects.filter( @@ -220,6 +219,7 @@ def handle(self, *args, **options): if year is not None: paper.year = year paper.save() + stats["papers_updated"] += 1 if author_names: from cppa_user_tracker.services import ( get_or_create_wg21_paper_author_profile, diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 639d400..966782e 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -126,21 +126,23 @@ def run_tracker_pipeline() -> int: # Filter newer mailings new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] - # Also check the latest one again just in case new papers were added - if latest_mailing and latest_mailing.mailing_date not in [ - m["mailing_date"] for m in new_mailings - ]: - # We re-check the most recent mailing from the DB to catch late additions - # Find the matching dict from all_mailings - current_m = next( - ( - m - for m in all_mailings - if m["mailing_date"] == latest_mailing.mailing_date - ), - None, + # Requeue incomplete mailings so transient failures get retried (not just the latest) + retry_dates = set( + WG21Mailing.objects.filter(papers__isnull=True).values_list( + "mailing_date", flat=True ) - if current_m: + ) + retry_dates.update( + WG21Mailing.objects.filter(papers__is_downloaded=False).values_list( + "mailing_date", flat=True + ) + ) + if latest_mailing: + retry_dates.add(latest_mailing.mailing_date) + for current_m in all_mailings: + if current_m["mailing_date"] in retry_dates and current_m[ + "mailing_date" + ] not in [x["mailing_date"] for x in new_mailings]: new_mailings.append(current_m) # Sort chronologically (oldest to newest) @@ -194,7 +196,7 @@ def run_tracker_pipeline() -> int: ) continue - # Group papers by ID to prioritize PDF over HTML (paper_id is case-insensitive) + # Group papers by ID so we can choose the preferred source format per paper. papers_by_id = {} for p in papers: pid = (p["paper_id"] or "").strip().lower() @@ -219,7 +221,7 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue - # Pick the best format (PDF first for conversion) + # Pick the preferred format: adoc > html > ps > pdf. 
best_paper = min(p_list, key=lambda x: format_priority(x["type"]))
 
             raw_filename = (best_paper.get("filename") or "").strip()
diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py
index 4756ffd..ad56a29 100644
--- a/wg21_paper_tracker/tests/test_pipeline.py
+++ b/wg21_paper_tracker/tests/test_pipeline.py
@@ -1,6 +1,5 @@
 """Tests for wg21_paper_tracker.pipeline."""
 
-from pathlib import Path
 from unittest.mock import patch, MagicMock
 
 import pytest
@@ -47,10 +46,10 @@ def test_download_file_success_binary(tmp_path):
     assert filepath.read_bytes() == b"\x25\x50\x44\x46"
 
 
-def test_download_file_uses_timeout():
+def test_download_file_uses_timeout(tmp_path):
     """_download_file calls requests.get with DOWNLOAD_TIMEOUT."""
     url = "https://example.com/f"
-    filepath = Path("/tmp/out")
+    filepath = tmp_path / "out"
     resp = MagicMock()
     resp.raise_for_status = MagicMock()
     resp.headers = {"content-type": "text/plain"}
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py
index 1934ed8..62ec55e 100644
--- a/wg21_paper_tracker/workspace.py
+++ b/wg21_paper_tracker/workspace.py
@@ -21,7 +21,7 @@ def get_workspace_root() -> Path:
 
 def get_raw_dir(mailing_date: str | None, year: int) -> Path:
     """Return workspace/raw/wg21_paper_tracker/<year>/<mailing_date>/; creates if missing."""
-    if not _MAILING_DATE_RE.fullmatch(mailing_date):
+    if mailing_date is not None and not _MAILING_DATE_RE.fullmatch(mailing_date):
         raise ValueError("mailing_date must be in YYYY-MM format")
     if getattr(settings, "RAW_DIR", None):
         raw_root = Path(settings.RAW_DIR) / _APP_SLUG

From 7403033971fcccf8b7c33d7b51626bf49af053e6 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 10 Mar 2026 21:01:37 -0700
Subject: [PATCH 11/20] Fix: doc and converter fixes #24

---
 docs/operations/WG21_Cloud_Run.md     |   6 +-
 docs/service_api/cppa_user_tracker.md |   2 +-
 .../converters/openai_converter.py    | 104 +++++++++++------
 wg21_paper_tracker/pipeline.py        |   9 +-
 4 files changed, 74 insertions(+), 47 deletions(-)

diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md
index cabd828..b1caccf 100644
--- a/docs/operations/WG21_Cloud_Run.md
+++ b/docs/operations/WG21_Cloud_Run.md
@@ -2,7 +2,7 @@
 
 The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is separated from the main Django project and runs as a Google Cloud Run Job.
 
-The Django tracker (`run_wg21_paper_tracker`) automatically triggers this job via the Google Cloud Run API when new papers are downloaded.
+When `WG21_CLOUD_RUN_ENABLED=true` and `WG21_CLOUD_RUN_JOB_NAME` is set, the Django tracker (`run_wg21_paper_tracker`) triggers the configured Cloud Run job after uploading new papers.
 
 ## 1. Setup Google Cloud Storage
 
@@ -61,5 +61,5 @@ Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manage
 1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs.
 2. It checks the WG21 site for new mailings.
 3. If found, it downloads PDFs and uploads them directly to `gs://<bucket>/raw/wg21_paper_tracker/<year>/<mailing_date>/`.
-4. It calls the Cloud Run API to execute `wg21-convert`.
-5. The Cloud Run Job spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs://<bucket>/converted/wg21_papers/<mailing_date>/`.
+4. If Cloud Run triggering is enabled, it calls the configured Cloud Run job, as sketched below.
+5. The Cloud Run Job then spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs://<bucket>/converted/wg21_papers/<mailing_date>/`.
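+
+For reference, the trigger itself is a single `run_job` call against the Cloud Run Jobs API. A minimal sketch, assuming the `google-cloud-run` client already pinned in `requirements.txt` (names here are illustrative, not the tracker's exact helper):
+
+```python
+from google.cloud import run_v2
+
+
+def trigger_job(project_id: str, location: str, job_name: str) -> None:
+    # Jobs are addressed by their fully qualified resource name.
+    name = f"projects/{project_id}/locations/{location}/jobs/{job_name}"
+    client = run_v2.JobsClient()
+    # run_job returns a long-running operation; we fire and forget here.
+    client.run_job(request=run_v2.RunJobRequest(name=name))
+```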
diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index 4ca0adb..bc89dbd 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -45,7 +45,7 @@ | Function | Parameter types | Return type | Description | | -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name. If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist, and email is provided, returns the one with that email if any; otherwise returns the first. Use this when linking paper authors so that same name + same email link to the same profile. | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, or multiple exist and one matches the email, returns that profile; otherwise returns the first. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. | --- diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index ae17f6e..8952f15 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -8,6 +8,8 @@ import io import logging import os +import shutil +import tempfile from pathlib import Path from typing import Optional @@ -34,32 +36,46 @@ ) -def pdf_to_images(pdf_path: Path) -> list[Image.Image]: +def pdf_to_images(pdf_path: Path) -> tuple[Optional[Path], list[Path]]: """ - Convert PDF pages to images. + Convert PDF pages to image files on disk (one per page) to avoid loading all into memory. - Note: pdf2image should automatically handle PDF rotation metadata, - but we also apply additional rotation correction in correct_image_rotation(). + Writes images into a temporary directory and returns (tmp_dir, paths). Caller must process + each path and then remove tmp_dir (e.g. shutil.rmtree) so only the current page is resident. + + Note: pdf2image should automatically handle PDF rotation metadata; we also apply + additional rotation correction in correct_image_rotation() when loading each image. Args: pdf_path: Path to the PDF file. Returns: - List of PIL Image objects. + (tmp_dir, list of image paths). tmp_dir is None on failure or if pdf2image unavailable; + paths are in page order. Caller must cleanup tmp_dir when not None. 
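+
+    Illustrative usage (this mirrors how convert_with_openai consumes it):
+        tmp_dir, paths = pdf_to_images(pdf_path)
+        try:
+            for p in paths:
+                ...  # open and convert one page image at a time
+        finally:
+            if tmp_dir is not None:
+                shutil.rmtree(tmp_dir, ignore_errors=True)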
""" if not PDF2IMAGE_AVAILABLE: logger.error("pdf2image is not available") - return [] + return (None, []) try: logger.info(f"Converting PDF to images: {pdf_path.name}") - # pdf2image should respect PDF rotation, but we'll also check EXIF data - images = convert_from_path(pdf_path, dpi=200) - logger.info(f"Converted {len(images)} pages to images") - return images + tmp_dir = Path(tempfile.mkdtemp(prefix="wg21_pdf_")) + try: + path_strs = convert_from_path( + pdf_path, + dpi=200, + paths_only=True, + output_folder=str(tmp_dir), + ) + paths = [Path(p) for p in path_strs] + logger.info(f"Converted {len(paths)} pages to images") + return (tmp_dir, paths) + except Exception: + shutil.rmtree(tmp_dir, ignore_errors=True) + raise except Exception as e: logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) - return [] + return (None, []) def correct_image_rotation(image: Image.Image) -> Image.Image: @@ -206,46 +222,50 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: try: logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - # Convert PDF to images - images = pdf_to_images(pdf_path) - if not images: + # Convert PDF to image files on disk (avoids loading all pages into memory) + tmp_dir, paths = pdf_to_images(pdf_path) + if not paths: logger.error(f"Failed to convert PDF to images: {pdf_path.name}") return None - total_pages = len(images) + total_pages = len(paths) markdown_parts = [] successful_pages = 0 - # Process each page - for page_num, image in enumerate(images, 1): - try: - # Convert image to base64 - image_base64 = image_to_base64(image) - - # Convert page with OpenAI - page_markdown = convert_page_with_openai( - image_base64, page_num, total_pages - ) - - if page_markdown: - markdown_parts.append(page_markdown) - markdown_parts.append("\n\n") - successful_pages += 1 - else: - logger.warning(f"Failed to convert page {page_num} with OpenAI") + try: + # Process each page: load one image at a time, convert, then move on + for page_num, image_path in enumerate(paths, 1): + try: + with Image.open(image_path) as img: + img.load() + image_base64 = image_to_base64(img) + # Convert page with OpenAI + page_markdown = convert_page_with_openai( + image_base64, page_num, total_pages + ) + + if page_markdown: + markdown_parts.append(page_markdown) + markdown_parts.append("\n\n") + successful_pages += 1 + else: + logger.warning(f"Failed to convert page {page_num} with OpenAI") + markdown_parts.append( + f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" + ) + + except Exception as e: + logger.error( + f"Error processing page {page_num}: {str(e)}", + exc_info=True, + ) markdown_parts.append( f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" ) - - except Exception as e: - logger.error( - f"Error processing page {page_num}: {str(e)}", - exc_info=True, - ) - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - continue + continue + finally: + if tmp_dir is not None: + shutil.rmtree(tmp_dir, ignore_errors=True) markdown_content = "".join(markdown_parts) diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 966782e..d7f96be 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -199,7 +199,14 @@ def run_tracker_pipeline() -> int: # Group papers by ID so we can choose the preferred source format per paper. 
papers_by_id = {} for p in papers: - pid = (p["paper_id"] or "").strip().lower() + pid = (p.get("paper_id") or "").strip().lower() + if not pid: + logger.warning( + "Skipping paper entry without a paper_id in mailing %s: %r", + mailing_date, + p, + ) + continue if pid not in papers_by_id: papers_by_id[pid] = [] papers_by_id[pid].append(p) From c33c475dd94253a9c7c2692cc0db83685ecc11c4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 10 Mar 2026 21:38:54 -0700 Subject: [PATCH 12/20] Fix: default sqlite, document #24 --- .github/workflows/actions.yml | 1 + config/test_settings.py | 10 +++-- requirements.txt | 1 - wg21_paper_tracker/admin.py | 4 +- wg21_paper_tracker/pipeline.py | 55 +++++++++++------------ wg21_paper_tracker/services.py | 2 + wg21_paper_tracker/tests/test_services.py | 7 +++ 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 11e32e5..f9ffb47 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -84,6 +84,7 @@ jobs: - name: Test with pytest env: DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres + TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres SECRET_KEY: for-testing-only DJANGO_SETTINGS_MODULE: config.test_settings run: | diff --git a/config/test_settings.py b/config/test_settings.py index 3c2e1db..2649ac5 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -7,10 +7,14 @@ from pathlib import Path from .settings import * # noqa: F401, F403 +from .settings import env -# Use SQLite in-memory for speed when DATABASE_URL not set (e.g. local pytest). -# CI can set DATABASE_URL=sqlite:///test.sqlite3 or leave unset for :memory: -if not os.environ.get("DATABASE_URL", "").strip(): +# Use SQLite in-memory for tests by default so no PostgreSQL is required. +# Set TEST_DATABASE_URL to run tests against PostgreSQL (e.g. in CI). 
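+# Example: TEST_DATABASE_URL=postgres://postgres:postgres@localhost:5432/postgres pytest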
+_test_db_url = os.environ.get("TEST_DATABASE_URL", "").strip() +if _test_db_url: + DATABASES = {"default": env.db("TEST_DATABASE_URL")} +else: DATABASES = { "default": { "ENGINE": "django.db.backends.sqlite3", diff --git a/requirements.txt b/requirements.txt index 289d486..a94ab8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,5 @@ selenium>=4.35 # wg21_paper_tracker app beautifulsoup4>=4.12.0 -lxml>=5.0.0 google-cloud-run>=0.10.1 google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py index 86784ae..bd57f4c 100644 --- a/wg21_paper_tracker/admin.py +++ b/wg21_paper_tracker/admin.py @@ -13,6 +13,7 @@ class WG21PaperAuthorInline(admin.TabularInline): model = WG21PaperAuthor extra = 1 raw_id_fields = ("profile",) + ordering = ("author_order", "id") @admin.register(WG21Paper) @@ -34,6 +35,7 @@ class WG21PaperAdmin(admin.ModelAdmin): @admin.register(WG21PaperAuthor) class WG21PaperAuthorAdmin(admin.ModelAdmin): - list_display = ("paper", "profile", "created_at") + list_display = ("paper", "profile", "author_order", "created_at") search_fields = ("paper__paper_id", "profile__display_name") raw_id_fields = ("paper", "profile") + ordering = ("paper", "author_order", "id") diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index d7f96be..9ea7550 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -243,6 +243,33 @@ def format_priority(ext: str) -> int: local_path = raw_dir / filename url = best_paper["url"] + # Persist paper row before transfer so failed downloads remain retry candidates + doc_date_str = best_paper.get("document_date") + from django.utils.dateparse import parse_date + + doc_date = None + if doc_date_str: + try: + doc_date = parse_date(doc_date_str) + except Exception as e: + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + doc_date = None + + paper_obj, _created = get_or_create_paper( + paper_id=pid, + url=url, + title=best_paper["title"], + document_date=doc_date, + mailing=mailing_obj, + subgroup=best_paper["subgroup"], + author_names=best_paper["authors"], + year=year, + ) + # Download if _download_file(url, local_path): uploaded = False @@ -257,34 +284,6 @@ def format_priority(ext: str) -> int: pid, ) - # Persist DB - doc_date_str = best_paper["document_date"] - # Parse date if available - from django.utils.dateparse import parse_date - - doc_date = None - if doc_date_str: - try: - doc_date = parse_date(doc_date_str) - except Exception as e: - logger.warning( - "Failed to parse document date: %s: %s", - doc_date_str, - e, - ) - doc_date = None - - paper_obj, _created = get_or_create_paper( - paper_id=pid, - url=url, - title=best_paper["title"], - document_date=doc_date, - mailing=mailing_obj, - subgroup=best_paper["subgroup"], - author_names=best_paper["authors"], - year=year, - ) - if uploaded: paper_obj.is_downloaded = True paper_obj.save(update_fields=["is_downloaded"]) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index f773b75..8030424 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -151,6 +151,8 @@ def get_or_create_paper_author( def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() + if year is None: + raise ValueError("year is required; pass 0 explicitly for placeholder papers") year_val = _normalize_year(year) WG21Paper.objects.filter( paper_id=paper_id, diff --git 
a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py
index 023f15c..bd3a3e2 100644
--- a/wg21_paper_tracker/tests/test_services.py
+++ b/wg21_paper_tracker/tests/test_services.py
@@ -218,6 +218,13 @@ def test_get_or_create_paper_sets_author_order(db):
 # --- mark_paper_downloaded ---
 
 
+@pytest.mark.django_db
+def test_mark_paper_downloaded_requires_year(db):
+    """mark_paper_downloaded raises ValueError when year is omitted."""
+    with pytest.raises(ValueError, match="year is required"):
+        mark_paper_downloaded("p1000r0")
+
+
 @pytest.mark.django_db
 def test_mark_paper_downloaded_sets_flag(db):
     """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year)."""

From 93ee8b7e3906ddf522134d074ded4caf6eaa3f31 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 11 Mar 2026 08:18:51 -0700
Subject: [PATCH 13/20] Fix: author profile merge avoidance, blank paper_id
 rejection, mailing race recovery, logging and Ruff #24

---
 cppa_user_tracker/services.py                 | 12 ++++------
 cppa_user_tracker/tests/test_services.py      | 23 +++++++++++++++----
 .../converters/docling_converter.py           |  6 ++---
 .../converters/openai_converter.py            | 19 ++++++++-------
 .../commands/import_wg21_metadata_from_csv.py |  9 ++++----
 wg21_paper_tracker/pipeline.py                |  2 +-
 wg21_paper_tracker/services.py                |  2 ++
 7 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py
index 152bbc2..34da007 100644
--- a/cppa_user_tracker/services.py
+++ b/cppa_user_tracker/services.py
@@ -395,11 +395,7 @@ def get_or_create_wg21_paper_author_profile(
         for p in candidates:
             if p.emails.filter(email=email_val).exists():
                 return p, False
-        profile = candidates[0]
-        if email_val and not profile.emails.filter(email=email_val).exists():
-            add_email(
-                profile,
-                email_val,
-                is_primary=not profile.emails.filter(is_active=True).exists(),
-            )
-        return profile, False
+        profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val)
+        add_email(profile, email_val, is_primary=True)
+        return profile, True
+    return candidates[0], False
diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py
index 6d4d85b..75775ed 100644
--- a/cppa_user_tracker/tests/test_services.py
+++ b/cppa_user_tracker/tests/test_services.py
@@ -608,6 +608,19 @@ def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it():
     assert profile.id == existing.id
 
 
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_adds_email():
+    """Existing single match gets the supplied email attached."""
+    existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author")
+    profile, created = services.get_or_create_wg21_paper_author_profile(
+        display_name="Solo Author",
+        email="solo@example.com",
+    )
+    assert created is False
+    assert profile.id == existing.id
+    assert profile.emails.filter(email="solo@example.com").exists()
+
+
 @pytest.mark.django_db
 def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first():
     """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email."""
@@ -635,8 +648,8 @@ def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_se
 
 
 @pytest.mark.django_db
-def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_returns_first():
-    """get_or_create_wg21_paper_author_profile returns first when email provided but no match."""
+def 
test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_creates_new_profile(): + """When multiple match and email matches none, a new profile is created with that email.""" first = WG21PaperAuthorProfile.objects.create(display_name="Other Name") second = WG21PaperAuthorProfile.objects.create(display_name="Other Name") services.add_email(second, "other@example.com", is_primary=True) @@ -644,5 +657,7 @@ def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_no display_name="Other Name", email="nomatch@example.com", ) - assert created is False - assert profile.id == first.id + assert created is True + assert profile.id not in (first.id, second.id) + assert profile.display_name == "Other Name" + assert profile.emails.filter(email="nomatch@example.com").exists() diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py index b9d6067..7e73753 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py @@ -56,8 +56,6 @@ def convert_with_docling(pdf_path: Path) -> Optional[str]: ) return None - except Exception as e: - logger.error( - f"Docling conversion failed for {pdf_path.name}: {str(e)}", exc_info=True - ) + except Exception: + logger.error(f"Docling conversion failed for {pdf_path.name}", exc_info=True) return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 8952f15..7edd988 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -222,17 +222,16 @@ def convert_with_openai(pdf_path: Path) -> Optional[str]: try: logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - # Convert PDF to image files on disk (avoids loading all pages into memory) - tmp_dir, paths = pdf_to_images(pdf_path) - if not paths: - logger.error(f"Failed to convert PDF to images: {pdf_path.name}") - return None - - total_pages = len(paths) - markdown_parts = [] - successful_pages = 0 - try: + # Convert PDF to image files on disk (avoids loading all pages into memory) + tmp_dir, paths = pdf_to_images(pdf_path) + if not paths: + logger.error(f"Failed to convert PDF to images: {pdf_path.name}") + return None + + total_pages = len(paths) + markdown_parts = [] + successful_pages = 0 # Process each page: load one image at a time, convert, then move on for page_num, image_path in enumerate(paths, 1): try: diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index 365a008..e00d64b 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -201,6 +201,8 @@ def handle(self, *args, **options): else: stats["papers_updated"] += 1 except IntegrityError as e: + # Re-resolve mailing (IntegrityError may have come from get_or_create_mailing race) + mailing, _ = get_or_create_mailing(mailing_date, mailing_title) # Duplicate (paper_id, year): fetch existing by same key and update try: lookup_year = year if year is not None else 0 @@ -230,12 +232,11 @@ def handle(self, *args, **options): name ) get_or_create_paper_author(paper, profile, i + 1) - except Exception as inner: + except Exception: 
stats["skipped"] += 1 - logger.error( - "Error for paper_id=%s (after IntegrityError): %s", + logger.exception( + "Error for paper_id=%s (after IntegrityError).", paper_id, - inner, ) except Exception as e: stats["skipped"] += 1 diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 9ea7550..894cc18 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -286,7 +286,7 @@ def format_priority(ext: str) -> int: if uploaded: paper_obj.is_downloaded = True - paper_obj.save(update_fields=["is_downloaded"]) + paper_obj.save(update_fields=["is_downloaded", "updated_at"]) total_new_papers += 1 # Clean up local file to save space diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 8030424..37bd91d 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -49,6 +49,8 @@ def get_or_create_paper( year: int | None = None, ) -> tuple[WG21Paper, bool]: paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") year_val = _normalize_year(year) def _update_paper(paper: WG21Paper) -> bool: From 61a6c7f1bcdd089c24632ce21b80652438abbe51 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Mar 2026 09:15:03 -0700 Subject: [PATCH 14/20] Fix: author profile merge avoidance, blank paper_id rejection, pipeline validation, mailing race recovery, logging, and API docs #24 --- docs/service_api/cppa_user_tracker.md | 2 +- wg21_paper_tracker/pipeline.py | 62 ++++++++++++++++++++++++--- wg21_paper_tracker/services.py | 2 + 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md index bc89dbd..8f50642 100644 --- a/docs/service_api/cppa_user_tracker.md +++ b/docs/service_api/cppa_user_tracker.md @@ -45,7 +45,7 @@ | Function | Parameter types | Return type | Description | | -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, or multiple exist and one matches the email, returns that profile; otherwise returns the first. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. | +| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name (optional email for disambiguation). If no profile exists, creates one and adds email if provided. If one exists, returns it. If multiple exist and one matches the email, returns that profile. If multiple exist and no email is provided, returns the first. 
If multiple exist and the supplied email matches none, creates a new profile with that email. **Side effect:** if `email` is supplied and the resolved or created profile does not already have that email, the function associates it with the profile (so existing profiles may be updated). Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. | --- diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 894cc18..516837e 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -228,8 +228,61 @@ def format_priority(ext: str) -> int: skipped_downloaded += 1 continue + # Filter to entries with required keys and valid types; skip malformed. + valid_list = [] + for p in p_list: + type_val = ( + (p.get("type") or "").strip() + if isinstance(p.get("type"), str) + else "" + ) + url_val = ( + (p.get("url") or "").strip() + if isinstance(p.get("url"), str) + else "" + ) + title_val = ( + (p.get("title") or "").strip() + if isinstance(p.get("title"), str) + else "" + ) + if not type_val or not url_val or not title_val: + logger.debug( + "Skipping malformed paper entry for %s in mailing %s: %r", + pid, + mailing_date, + p, + ) + continue + valid_list.append(p) + + if not valid_list: + logger.warning( + "Skipping paper %s in mailing %s: no valid entries (type, url, title)", + pid, + mailing_date, + ) + continue + # Pick the preferred format: adoc > html > ps > pdf. - best_paper = min(p_list, key=lambda x: format_priority(x["type"])) + best_paper = min( + valid_list, + key=lambda x: format_priority(str(x.get("type") or "").strip()), + ) + url = (best_paper.get("url") or "").strip() + title = (best_paper.get("title") or "").strip() + subgroup = (best_paper.get("subgroup") or "").strip() + authors = best_paper.get("authors") + if not isinstance(authors, list): + authors = [] + if not url or not title: + logger.warning( + "Skipping paper %s in mailing %s due to missing required fields: %r", + pid, + mailing_date, + best_paper, + ) + continue raw_filename = (best_paper.get("filename") or "").strip() filename = Path(raw_filename).name @@ -241,7 +294,6 @@ def format_priority(ext: str) -> int: ) continue local_path = raw_dir / filename - url = best_paper["url"] # Persist paper row before transfer so failed downloads remain retry candidates doc_date_str = best_paper.get("document_date") @@ -262,11 +314,11 @@ def format_priority(ext: str) -> int: paper_obj, _created = get_or_create_paper( paper_id=pid, url=url, - title=best_paper["title"], + title=title, document_date=doc_date, mailing=mailing_obj, - subgroup=best_paper["subgroup"], - author_names=best_paper["authors"], + subgroup=subgroup, + author_names=authors, year=year, ) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 37bd91d..b0be567 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -153,6 +153,8 @@ def get_or_create_paper_author( def mark_paper_downloaded(paper_id: str, year: int | None = None): paper_id = (paper_id or "").strip().lower() + if not paper_id: + raise ValueError("paper_id is required") if year is None: raise ValueError("year is required; pass 0 explicitly for placeholder papers") year_val = _normalize_year(year) From c246241b88f298a4aa2b4562b9a5bad05ef9945f Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Mar 2026 10:01:23 -0700 Subject: [PATCH 15/20] Fix: OpenRouter retries, CSV year from parsed date, placeholder race recovery #24 
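
Retries: the per-page OpenRouter call now retries timeouts, connection
errors, HTTP 429, and HTTP 5xx twice with 1s/2s backoff; other errors
still fail fast. The shape, as a minimal standalone sketch (hypothetical
helper name; the real change inlines this loop in convert_page_with_openai
and returns None instead of raising):

    import time
    import requests

    def post_with_retries(url, payload, headers, delays=(1, 2)):
        for attempt in range(len(delays) + 1):
            try:
                resp = requests.post(url, json=payload, headers=headers, timeout=120)
                resp.raise_for_status()
                return resp.json()
            except (requests.Timeout, requests.ConnectionError):
                # Transient network failure: retry unless attempts are exhausted.
                if attempt == len(delays):
                    raise
            except requests.HTTPError as e:
                status = e.response.status_code if e.response is not None else None
                retryable = status == 429 or (status is not None and 500 <= status < 600)
                if not retryable or attempt == len(delays):
                    raise
            time.sleep(delays[attempt])  # back off before the next attempt

Bounded retries keep a stuck page from blocking the whole document, while
429/5xx handling covers the common rate-limit and upstream-outage cases.
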
--- .../converters/openai_converter.py | 148 ++++++++++++------ .../commands/import_wg21_metadata_from_csv.py | 13 +- wg21_paper_tracker/services.py | 84 ++++++---- 3 files changed, 155 insertions(+), 90 deletions(-) diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py index 7edd988..66c08b0 100644 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py @@ -10,6 +10,7 @@ import os import shutil import tempfile +import time from pathlib import Path from typing import Optional @@ -148,56 +149,105 @@ def convert_page_with_openai( logger.error("OpenRouter API key is not set") return None - try: - logger.info(f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter") - - url = f"{OPENROUTER_BASE_URL}/chat/completions" - headers = { - "Authorization": f"Bearer {OPENROUTER_API_KEY}", - "Content-Type": "application/json", - } - - payload = { - "model": OPENROUTER_MODEL, - "messages": [ - { - "role": "system", - "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. If the page appears rotated, read and convert the text in its correct orientation.", - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}" - }, - }, - ], - }, - ], - "max_tokens": 4000, - } - - response = requests.post(url, json=payload, headers=headers, timeout=120) - response.raise_for_status() - - result = response.json() - markdown_content = result["choices"][0]["message"]["content"] - - logger.info(f"Successfully converted page {page_num} with OpenAI/OpenRouter") - return markdown_content + url = f"{OPENROUTER_BASE_URL}/chat/completions" + headers = { + "Authorization": f"Bearer {OPENROUTER_API_KEY}", + "Content-Type": "application/json", + } + + payload = { + "model": OPENROUTER_MODEL, + "messages": [ + { + "role": "system", + "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. 
If the page appears rotated, read and convert the text in its correct orientation.", + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_base64}"}, + }, + ], + }, + ], + "max_tokens": 4000, + } + + max_attempts = 3 # initial + 2 retries + retry_delays = [1, 2] # exponential backoff in seconds + + for attempt in range(max_attempts): + try: + logger.info( + f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter" + + (f" (attempt {attempt + 1}/{max_attempts})" if attempt > 0 else "") + ) - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None + response = requests.post(url, json=payload, headers=headers, timeout=120) + response.raise_for_status() + + result = response.json() + markdown_content = result["choices"][0]["message"]["content"] + + logger.info( + f"Successfully converted page {page_num} with OpenAI/OpenRouter" + ) + return markdown_content + + except ( + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + ) as e: + retryable = attempt < max_attempts - 1 + if retryable: + delay = retry_delays[attempt] + logger.warning( + f"Transient error on page {page_num} ({type(e).__name__}), " + f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" + ) + time.sleep(delay) + else: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + except requests.exceptions.HTTPError as e: + status_code = e.response.status_code if e.response is not None else None + retryable = ( + attempt < max_attempts - 1 + and status_code is not None + and (status_code == 429 or 500 <= status_code < 600) + ) + if retryable: + delay = retry_delays[attempt] + logger.warning( + f"HTTP {status_code} on page {page_num}, " + f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" + ) + time.sleep(delay) + else: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + except Exception as e: + logger.error( + f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", + exc_info=True, + ) + return None + + return None def convert_with_openai(pdf_path: Path) -> Optional[str]: diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index e00d64b..feff138 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -135,7 +135,6 @@ def handle(self, *args, **options): stats["rows"] += 1 paper_id = (row.get("paper_id", "") or "").strip().lower() url = row.get("url", "") - document_date = row.get("date", "") if not paper_id or not url: stats["skipped"] += 1 @@ -149,14 +148,14 @@ def handle(self, *args, **options): mailing_date, mailing_title = _resolve_mailing_date( row.get("mailing_date", "") ) - year_str = ( - mailing_date[:4] - if mailing_date and MAILING_DATE_PATTERN.match(mailing_date) - else (document_date[:4] if document_date else None) - ) - year = int(year_str) if year_str and year_str.isdigit() else None try: document_date = _parse_document_date(row.get("date", "")) + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): + year = int(mailing_date[:4]) + elif document_date is not None: + year = document_date.year + else: + year = None title = row.get("title", "") or paper_id subgroup = 
row.get("subgroup", "") author_names = _author_names_from_csv(row.get("author", "")) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index b0be567..24c7ba6 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional -from django.db import transaction +from django.db import IntegrityError, transaction from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor @@ -36,7 +36,6 @@ def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, b return mailing, created -@transaction.atomic def get_or_create_paper( paper_id: str, url: str, @@ -77,28 +76,49 @@ def _update_paper(paper: WG21Paper) -> bool: paper.save() return updated - if year_val > 0: - # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year - paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() - if paper: - _update_paper(paper) - created = False - else: - placeholder = WG21Paper.objects.filter(paper_id=paper_id, year=0).first() - if placeholder: - placeholder.url = url - placeholder.title = title - placeholder.document_date = document_date - placeholder.mailing = mailing - placeholder.subgroup = subgroup - placeholder.year = year_val - placeholder.save() - paper = placeholder - created = False + try: + with transaction.atomic(): + if year_val > 0: + # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year + paper = WG21Paper.objects.filter( + paper_id=paper_id, year=year_val + ).first() + if paper: + _update_paper(paper) + created = False + else: + placeholder = WG21Paper.objects.filter( + paper_id=paper_id, year=0 + ).first() + if placeholder: + try: + placeholder.url = url + placeholder.title = title + placeholder.document_date = document_date + placeholder.mailing = mailing + placeholder.subgroup = subgroup + placeholder.year = year_val + placeholder.save() + paper = placeholder + created = False + except IntegrityError: + raise # Roll back this transaction; recovery runs below + else: + paper, created = WG21Paper.objects.get_or_create( + paper_id=paper_id, + year=year_val, + defaults={ + "url": url, + "title": title, + "document_date": document_date, + "mailing": mailing, + "subgroup": subgroup, + }, + ) else: paper, created = WG21Paper.objects.get_or_create( paper_id=paper_id, - year=year_val, + year=0, defaults={ "url": url, "title": title, @@ -107,20 +127,16 @@ def _update_paper(paper: WG21Paper) -> bool: "subgroup": subgroup, }, ) - else: - paper, created = WG21Paper.objects.get_or_create( - paper_id=paper_id, - year=0, - defaults={ - "url": url, - "title": title, - "document_date": document_date, - "mailing": mailing, - "subgroup": subgroup, - }, - ) - if not created: + if not created: + _update_paper(paper) + except IntegrityError: + # Placeholder promotion hit (paper_id, year_val) unique constraint; fetch and update canonical row + with transaction.atomic(): + paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first() + if not paper: + raise _update_paper(paper) + created = False if author_names: emails = author_emails or [] From 3e32ee28977eb2780ef898554481a12e9ca72dcd Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 21 Mar 2026 04:43:12 -0700 Subject: [PATCH 16/20] refactor(wg21): pipeline dispatch + mailing range; remove Cloud Run stack #113 --- config/settings.py | 13 +- 
docs/operations/WG21_Cloud_Run.md | 65 -- docs/operations/WG21_GitHub_Dispatch.md | 69 ++ requirements.txt | 2 - wg21_paper_tracker/cloud_run_job/Dockerfile | 28 - .../cloud_run_job/converters/__init__.py | 9 - .../converters/docling_converter.py | 61 -- .../converters/openai_converter.py | 338 ---------- .../converters/pdfplumber_converter.py | 102 --- wg21_paper_tracker/cloud_run_job/main.py | 118 ---- .../cloud_run_job/requirements.txt | 6 - wg21_paper_tracker/fetcher.py | 143 +++-- .../commands/import_wg21_metadata_from_csv.py | 237 ++++--- .../commands/run_wg21_paper_tracker.py | 189 ++++-- wg21_paper_tracker/pipeline.py | 591 +++++++++--------- wg21_paper_tracker/tests/test_commands.py | 120 ++++ wg21_paper_tracker/tests/test_fetcher.py | 68 ++ wg21_paper_tracker/tests/test_pipeline.py | 282 +++++---- 18 files changed, 1084 insertions(+), 1357 deletions(-) delete mode 100644 docs/operations/WG21_Cloud_Run.md create mode 100644 docs/operations/WG21_GitHub_Dispatch.md delete mode 100644 wg21_paper_tracker/cloud_run_job/Dockerfile delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/__init__.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/docling_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/openai_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py delete mode 100644 wg21_paper_tracker/cloud_run_job/main.py delete mode 100644 wg21_paper_tracker/cloud_run_job/requirements.txt diff --git a/config/settings.py b/config/settings.py index 17f549a..b92f137 100644 --- a/config/settings.py +++ b/config/settings.py @@ -217,11 +217,14 @@ ).resolve() # WG21 Paper Tracker Configuration -WG21_GCS_BUCKET = (env("WG21_GCS_BUCKET", default="") or "").strip() -GCP_PROJECT_ID = (env("GCP_PROJECT_ID", default="") or "").strip() -GCP_LOCATION = (env("GCP_LOCATION", default="us-central1") or "").strip() -WG21_CLOUD_RUN_JOB_NAME = (env("WG21_CLOUD_RUN_JOB_NAME", default="") or "").strip() -WG21_CLOUD_RUN_ENABLED = env.bool("WG21_CLOUD_RUN_ENABLED", default=False) +WG21_GITHUB_DISPATCH_ENABLED = env.bool("WG21_GITHUB_DISPATCH_ENABLED", default=False) +WG21_GITHUB_DISPATCH_REPO = (env("WG21_GITHUB_DISPATCH_REPO", default="") or "").strip() +WG21_GITHUB_DISPATCH_TOKEN = ( + env("WG21_GITHUB_DISPATCH_TOKEN", default="") or "" +).strip() +WG21_GITHUB_DISPATCH_EVENT_TYPE = ( + env("WG21_GITHUB_DISPATCH_EVENT_TYPE", default="wg21_papers_convert") or "" +).strip() or "wg21_papers_convert" # Logging - project-wide configuration for app commands (console + rotating file) LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs"))) diff --git a/docs/operations/WG21_Cloud_Run.md b/docs/operations/WG21_Cloud_Run.md deleted file mode 100644 index b1caccf..0000000 --- a/docs/operations/WG21_Cloud_Run.md +++ /dev/null @@ -1,65 +0,0 @@ -# WG21 Paper Conversion Cloud Run Job - -The PDF-to-Markdown conversion for WG21 papers is computationally heavy and requires system packages like `poppler`. It is separated from the main Django project and runs as a Google Cloud Run Job. - -When `WG21_CLOUD_RUN_ENABLED=true` and `WG21_CLOUD_RUN_JOB_NAME` is set, the Django tracker (`run_wg21_paper_tracker`) triggers the configured Cloud Run job after uploading new papers. - -## 1. Setup Google Cloud Storage - -Create a GCS bucket (e.g., `wg21-data-collector`). - -Ensure your Django app has the following environment variables configured: - -- `WG21_GCS_BUCKET`: The name of the GCS bucket. -- `GCP_PROJECT_ID`: Your GCP project ID. 
-- `WG21_CLOUD_RUN_JOB_NAME`: (Optional) The name of the deployed Cloud Run job (e.g. `wg21-convert`). No default; leave unset if you only use GCS uploads without triggering the job.
-- `WG21_CLOUD_RUN_ENABLED`: (Optional, default `false`) Set to `true` to allow the tracker to trigger the Cloud Run conversion job when new papers are uploaded. Keeps the trigger optional even when project and bucket are set.
-- `GCP_LOCATION`: (Optional, defaults to `us-central1`) Region for the Cloud Run job.
-
-## 2. Build and Push the Docker Image
-
-Navigate to the Cloud Run job directory:
-
-```bash
-cd wg21_paper_tracker/cloud_run_job/
-```
-
-Build the Docker image. Replace `[PROJECT_ID]` with your GCP Project ID:
-
-```bash
-docker build -t gcr.io/[PROJECT_ID]/wg21-convert .
-```
-
-Push the image to Google Container Registry (or Artifact Registry):
-
-```bash
-docker push gcr.io/[PROJECT_ID]/wg21-convert
-```
-
-## 3. Create the Cloud Run Job
-
-Create the job in Google Cloud. We recommend allocating sufficient memory and CPU since Docling and PDFPlumber are resource-intensive.
-
-```bash
-gcloud run jobs create wg21-convert \
-  --image gcr.io/[PROJECT_ID]/wg21-convert \
-  --memory 8Gi \
-  --cpu 4 \
-  --region us-central1 \
-  --set-env-vars WG21_GCS_BUCKET=wg21-data-collector
-```
-
-Provide `OPENROUTER_API_KEY` via Cloud Run secret injection (e.g. [Secret Manager](https://cloud.google.com/run/docs/configuring/secrets)) rather than inline in `--set-env-vars`, to avoid leaking the key into shell history, CI logs, or audit trails.
-
-## 4. Service Account & IAM Permissions
-
-1. **Tracker Permission:** The environment running the Django app (e.g., Celery worker or Scheduler) must run under a Service Account that has the `Cloud Run Invoker` (`roles/run.invoker`) role to trigger the job via the API.
-2. **GCS Access:** Both the Django application and the Cloud Run job require read/write access to the GCS bucket (`roles/storage.objectAdmin`).
-
-## 5. Flow Summary
-
-1. **Daily (e.g. 1 AM)**: The `run_wg21_paper_tracker` command runs.
-2. It checks the WG21 site for new mailings.
-3. If found, it downloads PDFs and uploads them directly to `gs:///raw/wg21_paper_tracker///`.
-4. If Cloud Run triggering is enabled, it calls the configured Cloud Run job.
-5. The Cloud Run Job then spins up, reads the new PDFs from GCS, converts them, and uploads the `.md` results to `gs:///converted/wg21_papers//`.
diff --git a/docs/operations/WG21_GitHub_Dispatch.md b/docs/operations/WG21_GitHub_Dispatch.md
new file mode 100644
index 0000000..49046b0
--- /dev/null
+++ b/docs/operations/WG21_GitHub_Dispatch.md
@@ -0,0 +1,69 @@
+# WG21 Paper Tracker → GitHub Actions (`repository_dispatch`)
+
+The Django management command **`run_wg21_paper_tracker`** scrapes WG21 mailings and stores paper metadata in the database. It does **not** download PDFs or other documents. When **new** paper rows are created in a run, it can send **one** [repository dispatch](https://docs.github.com/en/rest/repos/repos#create-a-repository-dispatch-event) to another GitHub repository so a workflow there fetches each URL and runs conversion (e.g. PDF → Markdown).
+
+## Environment variables
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `WG21_GITHUB_DISPATCH_ENABLED` | No (default `false`) | Set to `true` to send `repository_dispatch` when there are new papers. |
+| `WG21_GITHUB_DISPATCH_REPO` | Yes, if enabled | Target repo as `owner/repo` (the repo whose workflow will run). |
+| `WG21_GITHUB_DISPATCH_TOKEN` | Yes, if enabled | PAT or token with permission to create repository dispatch events on that repo (classic PAT: `repo` scope for private repos). |
+| `WG21_GITHUB_DISPATCH_EVENT_TYPE` | No | Must match `on.repository_dispatch.types` in the target workflow. Default: `wg21_papers_convert`. |
+
+## `client_payload` contract
+
+The JSON body includes only a list of URL strings:
+
+```json
+{
+  "papers": [
+    "https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/…",
+    "https://www.open-std.org/…"
+  ]
+}
+```
+
+- **`papers`**: array of strings (WG21 document URLs), all new papers from **that** pipeline run in a **single** event.
+- There is **no** `new_paper_count` field; if the workflow needs a count, compute it in a step (e.g. by piping the `papers` JSON through `jq length`), since GitHub Actions expressions have no `length()` function.
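+
+## Sending the event manually (example)
+
+For a quick end-to-end test, the same event can be sent by hand. A minimal
+`curl` sketch (`OWNER/REPO` and the paper URL are placeholders; the token is
+read from the environment):
+
+```bash
+curl -X POST \
+  -H "Accept: application/vnd.github+json" \
+  -H "Authorization: Bearer $WG21_GITHUB_DISPATCH_TOKEN" \
+  -H "X-GitHub-Api-Version: 2022-11-28" \
+  https://api.github.com/repos/OWNER/REPO/dispatches \
+  -d '{"event_type": "wg21_papers_convert", "client_payload": {"papers": ["https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/pNNNNrM.pdf"]}}'
+```
+
+A successful request returns `204 No Content`; the dispatched workflow run
+should then appear in the conversion repo's Actions tab.
+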
+## Target repository workflow (example)
+
+```yaml
+on:
+  repository_dispatch:
+    types: [wg21_papers_convert]
+
+jobs:
+  convert:
+    runs-on: ubuntu-latest
+    steps:
+      - name: URLs
+        run: |
+          echo '${{ toJson(github.event.client_payload.papers) }}'
+          # Fetch each URL, convert, store artifacts / upload elsewhere
+```
+
+In expressions, `github.event.client_payload.papers` is a JSON array of strings.
+
+## Token security
+
+Store `WG21_GITHUB_DISPATCH_TOKEN` in a secret manager or CI secret; never commit it. Prefer a fine-grained PAT scoped to the conversion repo if possible.
+
+## Payload size
+
+Very large mailings could produce many URLs in one payload. If you approach GitHub or runner limits, split the list across multiple dispatches as an edge case; the default is one dispatch per tracker run with the full list.
+
+## CLI options
+
+- **`--from-date YYYY-MM`**: Process mailings with `mailing_date >= YYYY-MM` (WG21 / CSV style). Backfills from that key onward when used alone.
+- **`--to-date YYYY-MM`**: Upper bound: `mailing_date <= YYYY-MM`. With `--from-date`, the run uses the inclusive range `[from, to]`. Without `--from-date`, behavior stays incremental (only mailings **newer than** the latest `WG21Mailing` in the DB), but capped at `to`, which is useful to avoid pulling very new mailings in a controlled run.
- **`--dry-run`**: Log only; do not run the pipeline or send the dispatch.
+
+## Flow summary
+
+1. Scheduler runs `run_wg21_paper_tracker` (optionally with `--from-date` / `--to-date`).
+2. Pipeline fetches mailings, upserts `WG21Mailing` / `WG21Paper` (metadata only).
+3. For each row **newly created** in that run, its document URL is collected.
+4. If the list is non-empty and dispatch is enabled, the app sends a single `POST /repos/{owner}/{repo}/dispatches` request with `event_type` and `client_payload: { "papers": [ ... ] }`.
+5. The conversion repo’s workflow runs and downloads each URL.
diff --git a/requirements.txt b/requirements.txt
index a94ab8d..80db52f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,5 +15,3 @@ selenium>=4.35
 
 # wg21_paper_tracker app
 beautifulsoup4>=4.12.0
-google-cloud-run>=0.10.1
-google-cloud-storage>=2.14.0
diff --git a/wg21_paper_tracker/cloud_run_job/Dockerfile b/wg21_paper_tracker/cloud_run_job/Dockerfile
deleted file mode 100644
index d52244b..0000000
--- a/wg21_paper_tracker/cloud_run_job/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-# Use an official Python runtime as a parent image
-FROM python:3.11-slim
-
-# Set working directory
-WORKDIR /app
-
-# Install system dependencies required by converters (e.g. 
Poppler for PDF image extraction) -RUN apt-get update && apt-get install -y --no-install-recommends \ - poppler-utils \ - libgl1-mesa-glx \ - libglib2.0-0 \ - && rm -rf /var/lib/apt/lists/* - -RUN groupadd -r app && useradd -r -g app app - -# Copy requirements -COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application files -COPY --chown=app:app . . - -USER app - -# Run the main script -CMD ["python", "main.py"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/__init__.py b/wg21_paper_tracker/cloud_run_job/converters/__init__.py deleted file mode 100644 index 515d30a..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -PDF to Markdown converters module. -""" - -from .docling_converter import convert_with_docling -from .pdfplumber_converter import convert_with_pdfplumber -from .openai_converter import convert_with_openai - -__all__ = ["convert_with_docling", "convert_with_pdfplumber", "convert_with_openai"] diff --git a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py b/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py deleted file mode 100644 index 7e73753..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/docling_converter.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Docling-based PDF to Markdown converter. -""" - -from pathlib import Path -from typing import Optional -import logging - -logger = logging.getLogger(__name__) - -try: - from docling.document_converter import DocumentConverter # type: ignore[import-untyped] - from docling.datamodel.base_models import InputFormat # type: ignore[import-untyped] - - DOCLING_AVAILABLE = True -except ImportError: - DocumentConverter = None # type: ignore[assignment,misc] - InputFormat = None # type: ignore[assignment,misc] - DOCLING_AVAILABLE = False - logger.warning("Docling not available. Install with: pip install docling") - - -def convert_with_docling(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using Docling. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. - """ - if not DOCLING_AVAILABLE or DocumentConverter is None: - logger.error("Docling is not available") - return None - - try: - logger.info(f"Attempting Docling conversion for: {pdf_path.name}") - - # Initialize converter - converter = DocumentConverter() - - # Convert PDF to document - result = converter.convert(pdf_path) - - # Extract markdown - markdown_content = result.document.export_to_markdown() - - if markdown_content and len(markdown_content.strip()) > 0: - logger.info(f"Docling conversion successful for: {pdf_path.name}") - logger.info(f"Extracted {len(markdown_content)} characters") - return markdown_content - else: - logger.warning( - f"Docling conversion returned empty content for: {pdf_path.name}" - ) - return None - - except Exception: - logger.error(f"Docling conversion failed for {pdf_path.name}", exc_info=True) - return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py b/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py deleted file mode 100644 index 66c08b0..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/openai_converter.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -OpenAI/OpenRouter-based PDF to Markdown converter with OCR. 
-""" - -from __future__ import annotations - -import base64 -import io -import logging -import os -import shutil -import tempfile -import time -from pathlib import Path -from typing import Optional - -import requests - -logger = logging.getLogger(__name__) - -# Base configuration fallback - - -OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") -OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" -OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o") - -try: - from pdf2image import convert_from_path - from PIL import Image, ImageOps - - PDF2IMAGE_AVAILABLE = True -except ImportError: - PDF2IMAGE_AVAILABLE = False - logger.warning( - "pdf2image/PIL not available. Install with: pip install pdf2image pillow" - ) - - -def pdf_to_images(pdf_path: Path) -> tuple[Optional[Path], list[Path]]: - """ - Convert PDF pages to image files on disk (one per page) to avoid loading all into memory. - - Writes images into a temporary directory and returns (tmp_dir, paths). Caller must process - each path and then remove tmp_dir (e.g. shutil.rmtree) so only the current page is resident. - - Note: pdf2image should automatically handle PDF rotation metadata; we also apply - additional rotation correction in correct_image_rotation() when loading each image. - - Args: - pdf_path: Path to the PDF file. - - Returns: - (tmp_dir, list of image paths). tmp_dir is None on failure or if pdf2image unavailable; - paths are in page order. Caller must cleanup tmp_dir when not None. - """ - if not PDF2IMAGE_AVAILABLE: - logger.error("pdf2image is not available") - return (None, []) - - try: - logger.info(f"Converting PDF to images: {pdf_path.name}") - tmp_dir = Path(tempfile.mkdtemp(prefix="wg21_pdf_")) - try: - path_strs = convert_from_path( - pdf_path, - dpi=200, - paths_only=True, - output_folder=str(tmp_dir), - ) - paths = [Path(p) for p in path_strs] - logger.info(f"Converted {len(paths)} pages to images") - return (tmp_dir, paths) - except Exception: - shutil.rmtree(tmp_dir, ignore_errors=True) - raise - except Exception as e: - logger.error(f"Failed to convert PDF to images: {str(e)}", exc_info=True) - return (None, []) - - -def correct_image_rotation(image: Image.Image) -> Image.Image: - """ - Correct image rotation using EXIF data and heuristics. - - Args: - image: PIL Image object. - - Returns: - Corrected PIL Image object. - """ - try: - # First, try to correct using EXIF orientation data - # This handles images that have rotation metadata - corrected_image = ImageOps.exif_transpose(image) - - # If the image was rotated, log it - if corrected_image != image: - logger.debug("Image rotation corrected using EXIF data") - return corrected_image - - # If no EXIF data, check if image might be rotated - # For PDF pages, we can check if width > height suggests landscape - # But we'll keep the original orientation as PDFs can be in any orientation - # The OpenAI vision model can handle rotated text, but it's better to correct it - - return corrected_image - - except Exception as e: - logger.warning(f"Error correcting image rotation: {str(e)}") - return image - - -def image_to_base64(image: Image.Image) -> str: - """ - Convert PIL Image to base64 string. - Automatically corrects rotation before encoding. - - Args: - image: PIL Image object. - - Returns: - Base64 encoded string. 
- """ - # Correct rotation before encoding - corrected_image = correct_image_rotation(image) - - buffered = io.BytesIO() - corrected_image.save(buffered, format="PNG") - img_str = base64.b64encode(buffered.getvalue()).decode() - return img_str - - -def convert_page_with_openai( - image_base64: str, page_num: int, total_pages: int -) -> Optional[str]: - """ - Convert a single page image to markdown using OpenAI/OpenRouter. - - Args: - image_base64: Base64 encoded image string. - page_num: Current page number. - total_pages: Total number of pages. - - Returns: - Markdown content for the page, or None if conversion fails. - """ - if not OPENROUTER_API_KEY: - logger.error("OpenRouter API key is not set") - return None - - url = f"{OPENROUTER_BASE_URL}/chat/completions" - headers = { - "Authorization": f"Bearer {OPENROUTER_API_KEY}", - "Content-Type": "application/json", - } - - payload = { - "model": OPENROUTER_MODEL, - "messages": [ - { - "role": "system", - "content": "You are a document conversion assistant. Convert the provided PDF page image to clean, well-formatted Markdown. Preserve the structure, formatting, tables, and content as accurately as possible. Use proper markdown syntax for headers, lists, tables, and code blocks. If the image appears rotated, read the text in its current orientation and convert it correctly.", - }, - { - "role": "user", - "content": [ - { - "type": "text", - "text": f"Convert this PDF page ({page_num} of {total_pages}) to Markdown format. Preserve all text, structure, and formatting. If the page appears rotated, read and convert the text in its correct orientation.", - }, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - }, - ], - }, - ], - "max_tokens": 4000, - } - - max_attempts = 3 # initial + 2 retries - retry_delays = [1, 2] # exponential backoff in seconds - - for attempt in range(max_attempts): - try: - logger.info( - f"Converting page {page_num}/{total_pages} with OpenAI/OpenRouter" - + (f" (attempt {attempt + 1}/{max_attempts})" if attempt > 0 else "") - ) - - response = requests.post(url, json=payload, headers=headers, timeout=120) - response.raise_for_status() - - result = response.json() - markdown_content = result["choices"][0]["message"]["content"] - - logger.info( - f"Successfully converted page {page_num} with OpenAI/OpenRouter" - ) - return markdown_content - - except ( - requests.exceptions.Timeout, - requests.exceptions.ConnectionError, - ) as e: - retryable = attempt < max_attempts - 1 - if retryable: - delay = retry_delays[attempt] - logger.warning( - f"Transient error on page {page_num} ({type(e).__name__}), " - f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" - ) - time.sleep(delay) - else: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None - - except requests.exceptions.HTTPError as e: - status_code = e.response.status_code if e.response is not None else None - retryable = ( - attempt < max_attempts - 1 - and status_code is not None - and (status_code == 429 or 500 <= status_code < 600) - ) - if retryable: - delay = retry_delays[attempt] - logger.warning( - f"HTTP {status_code} on page {page_num}, " - f"retrying in {delay}s (attempt {attempt + 1}/{max_attempts})" - ) - time.sleep(delay) - else: - logger.error( - f"OpenAI/OpenRouter conversion failed for page {page_num}: {str(e)}", - exc_info=True, - ) - return None - - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for page 
{page_num}: {str(e)}", - exc_info=True, - ) - return None - - return None - - -def convert_with_openai(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using OpenAI/OpenRouter with OCR. - Processes each page as an image. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. - """ - if not OPENROUTER_API_KEY: - logger.error("OpenRouter API key is not set in environment variables") - return None - - if not PDF2IMAGE_AVAILABLE: - logger.error("pdf2image is required for OpenAI conversion") - return None - - try: - logger.info(f"Attempting OpenAI/OpenRouter conversion for: {pdf_path.name}") - - try: - # Convert PDF to image files on disk (avoids loading all pages into memory) - tmp_dir, paths = pdf_to_images(pdf_path) - if not paths: - logger.error(f"Failed to convert PDF to images: {pdf_path.name}") - return None - - total_pages = len(paths) - markdown_parts = [] - successful_pages = 0 - # Process each page: load one image at a time, convert, then move on - for page_num, image_path in enumerate(paths, 1): - try: - with Image.open(image_path) as img: - img.load() - image_base64 = image_to_base64(img) - # Convert page with OpenAI - page_markdown = convert_page_with_openai( - image_base64, page_num, total_pages - ) - - if page_markdown: - markdown_parts.append(page_markdown) - markdown_parts.append("\n\n") - successful_pages += 1 - else: - logger.warning(f"Failed to convert page {page_num} with OpenAI") - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - - except Exception as e: - logger.error( - f"Error processing page {page_num}: {str(e)}", - exc_info=True, - ) - markdown_parts.append( - f"## Page {page_num}\n\n*[Page content unavailable]*\n\n" - ) - continue - finally: - if tmp_dir is not None: - shutil.rmtree(tmp_dir, ignore_errors=True) - - markdown_content = "".join(markdown_parts) - - if successful_pages > 0 and markdown_content.strip(): - logger.info(f"OpenAI/OpenRouter conversion successful for: {pdf_path.name}") - logger.info( - f"Extracted {len(markdown_content)} characters from {total_pages} pages" - ) - return markdown_content - logger.warning( - "OpenAI/OpenRouter conversion produced no usable pages for: %s", - pdf_path.name, - ) - return None - - except Exception as e: - logger.error( - f"OpenAI/OpenRouter conversion failed for {pdf_path.name}: {str(e)}", - exc_info=True, - ) - return None diff --git a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py b/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py deleted file mode 100644 index fb36c4e..0000000 --- a/wg21_paper_tracker/cloud_run_job/converters/pdfplumber_converter.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -PDFPlumber-based PDF to Markdown converter. -""" - -from pathlib import Path -from typing import Optional -import logging - -logger = logging.getLogger(__name__) - -try: - import pdfplumber - - PDFPLUMBER_AVAILABLE = True -except ImportError: - PDFPLUMBER_AVAILABLE = False - logger.warning("PDFPlumber not available. Install with: pip install pdfplumber") - - -def convert_with_pdfplumber(pdf_path: Path) -> Optional[str]: - """ - Convert PDF to Markdown using PDFPlumber. - - Args: - pdf_path: Path to the PDF file. - - Returns: - Markdown content as string, or None if conversion fails. 
- """ - if not PDFPLUMBER_AVAILABLE: - logger.error("PDFPlumber is not available") - return None - - try: - logger.info(f"Attempting PDFPlumber conversion for: {pdf_path.name}") - - markdown_parts = [] - - with pdfplumber.open(pdf_path) as pdf: - total_pages = len(pdf.pages) - logger.info(f"Processing {total_pages} pages with PDFPlumber") - - for page_num, page in enumerate(pdf.pages, 1): - try: - # Extract text from page - text = page.extract_text() - - if text: - markdown_parts.append(text.replace("\n", " \n")) - markdown_parts.append("\n\n") - - # Extract tables if any - tables = page.extract_tables() - if tables: - for table in tables: - if table: - markdown_parts.append("\n### Table\n\n") - first_row = True - # Convert table to markdown format - for row in table: - if row: - markdown_parts.append( - "| " - + " | ".join( - "" if cell is None else str(cell) - for cell in row - ) - + " |\n" - ) - if first_row: - markdown_parts.append( - "| " - + " | ".join("---" for _ in row) - + " |\n" - ) - first_row = False - markdown_parts.append("\n") - - except Exception as e: - logger.warning( - f"Error processing page {page_num} of {pdf_path.name}: {str(e)}" - ) - continue - - markdown_content = "".join(markdown_parts) - - if markdown_content and len(markdown_content.strip()) > 0: - logger.info(f"PDFPlumber conversion successful for: {pdf_path.name}") - logger.info(f"Extracted {len(markdown_content)} characters") - return markdown_content - else: - logger.warning( - f"PDFPlumber conversion returned empty content for: {pdf_path.name}" - ) - return None - - except Exception as e: - logger.error( - f"PDFPlumber conversion failed for {pdf_path.name}: {str(e)}", - exc_info=True, - ) - return None diff --git a/wg21_paper_tracker/cloud_run_job/main.py b/wg21_paper_tracker/cloud_run_job/main.py deleted file mode 100644 index 61c57dc..0000000 --- a/wg21_paper_tracker/cloud_run_job/main.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import logging -from pathlib import Path -import tempfile -from typing import Optional - -from google.cloud import storage - -from converters.docling_converter import convert_with_docling -from converters.pdfplumber_converter import convert_with_pdfplumber -from converters.openai_converter import convert_with_openai - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - -MIN_CONTENT_LENGTH = 50 - - -def is_content_valid(content: Optional[str]) -> bool: - if not content: - return False - content_stripped = content.strip() - if len(content_stripped) < MIN_CONTENT_LENGTH: - return False - error_patterns = [ - "traceback", - "exception:", - "error:", - "failed to", - "unable to convert", - "conversion failed", - "error processing", - ] - content_lower = content_stripped.lower() - first_part = content_lower[:1000] - for pattern in error_patterns: - if pattern in first_part: - if pattern in ("error:", "exception:"): - return False - idx = first_part.find(pattern) - if idx < 100: - return False - return True - - -def convert_pdf_to_md(pdf_path: Path) -> str: - logger.info("Attempting Docling conversion...") - content = convert_with_docling(pdf_path) - if is_content_valid(content): - return content - - logger.info("Attempting PDFPlumber conversion...") - content = convert_with_pdfplumber(pdf_path) - if is_content_valid(content): - return content - - logger.info("Attempting OpenAI conversion...") - content = convert_with_openai(pdf_path) - if is_content_valid(content): - return content - - 
return "" - - -def main(): - bucket_name = os.getenv("WG21_GCS_BUCKET") - if not bucket_name: - logger.error("WG21_GCS_BUCKET env var not set.") - raise RuntimeError("WG21_GCS_BUCKET env var not set.") - - client = storage.Client() - bucket = client.bucket(bucket_name) - - raw_prefix = "raw/wg21_paper_tracker/" - converted_prefix = "converted/wg21_papers/" - - blobs = client.list_blobs(bucket, prefix=raw_prefix) - - with tempfile.TemporaryDirectory() as tmpdir: - for blob in blobs: - if not blob.name.lower().endswith(".pdf"): - continue - - local_pdf_path = Path(tmpdir) / "temp.pdf" - try: - # e.g. raw/wg21_paper_tracker/2025/2025-02/p0149r1.pdf -> 2025/2025-02/p0149r1.pdf - relative_path = blob.name[len(raw_prefix) :] - md_relative_path = relative_path.rsplit(".", 1)[0] + ".md" - md_blob_name = f"{converted_prefix}{md_relative_path}" - - md_blob = bucket.blob(md_blob_name) - if md_blob.exists(): - logger.info("Skipping %s, MD already exists.", blob.name) - continue - - logger.info("Downloading %s to process...", blob.name) - blob.download_to_filename(str(local_pdf_path)) - - logger.info("Converting %s...", blob.name) - md_content = convert_pdf_to_md(local_pdf_path) - - if md_content: - md_blob.upload_from_string(md_content, content_type="text/markdown") - logger.info("Successfully converted and uploaded %s", md_blob_name) - else: - logger.error("Failed to convert %s", blob.name) - except Exception: - logger.exception("Failed processing %s", blob.name) - finally: - if local_pdf_path.exists(): - local_pdf_path.unlink() - - -if __name__ == "__main__": - main() diff --git a/wg21_paper_tracker/cloud_run_job/requirements.txt b/wg21_paper_tracker/cloud_run_job/requirements.txt deleted file mode 100644 index 82422b1..0000000 --- a/wg21_paper_tracker/cloud_run_job/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -docling>=1.0.0 -pdfplumber>=0.10.0 -pdf2image>=1.16.0 -Pillow>=12.1.1 -requests>=2.31.0 -google-cloud-storage>=2.14.0 diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py index 2c6ad03..05f6e98 100644 --- a/wg21_paper_tracker/fetcher.py +++ b/wg21_paper_tracker/fetcher.py @@ -18,6 +18,86 @@ BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers" _MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$") +# Paper link in first column: e.g. p1234r0.pdf, n4920.html, sd-9.md +_PAPER_LINK_PATTERN = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) + + +def extract_paper_metadata_from_table_row( + cells: list[Tag], + page_url: str, +) -> Optional[dict]: + """ + Extract paper metadata from a WG21 mailing table row (td/th cells). + + Current year pages (e.g. 2026) use eight columns:: + + WG21 Number | Title | Author | Document Date | Mailing Date | + Previous Version | Subgroup | Disposition + + So **subgroup is index 6**, not 4. Index 4 is *mailing date* (string as shown on the site). + + Older pages used a shorter row (five data columns); then subgroup was at index 4. + If ``len(cells) >= 8`` we use the 8-column layout; otherwise we keep the legacy mapping. 
+ """ + if not cells: + return None + + first_cell = cells[0] + base = urllib.parse.urlparse(BASE_URL) + + title = "" + if len(cells) > 1: + title = cells[1].text.strip() + + authors: list[str] = [] + if len(cells) > 2: + authors_raw = cells[2].text.strip() + if authors_raw: + authors = [ + a.strip() for a in re.split(r",| and ", authors_raw) if a.strip() + ] + + document_date = None + if len(cells) > 3: + date_str = cells[3].text.strip() + if date_str: + document_date = date_str + + # 8+ columns: mailing date [4], previous version [5], subgroup [6], disposition [7] + subgroup = "" + if len(cells) >= 8: + subgroup = cells[6].text.strip() + elif len(cells) > 4: + subgroup = cells[4].text.strip() + + for link in first_cell.find_all("a", href=True): + href = link.get("href", "") + match = _PAPER_LINK_PATTERN.search(href) + if not match: + continue + + paper_url = urllib.parse.urljoin(page_url, href) + parsed = urllib.parse.urlparse(paper_url) + if parsed.scheme not in ("https", "http") or parsed.netloc != base.netloc: + logger.warning("Skipping off-origin paper URL %s", paper_url) + continue + + paper_id = match.group(1).lower() + file_ext = match.group(2).lower() + filename = match.group(0).lower() + + return { + "url": paper_url, + "filename": filename, + "type": file_ext, + "paper_id": paper_id, + "title": title, + "authors": authors, + "document_date": document_date, + "subgroup": subgroup, + } + + return None def _find_table_in_section(anchor) -> Optional[Tag]: @@ -112,72 +192,15 @@ def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]: return [] paper_urls = [] - paper_pattern = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE) for row in table.find_all("tr"): cells = row.find_all(["td", "th"]) if not cells or any(cell.get("colspan") for cell in cells): continue - # Usually: Number, Title, Author, Date, Subgroup - if len(cells) >= 1: - first_cell = cells[0] - for link in first_cell.find_all("a", href=True): - href = link.get("href", "") - match = paper_pattern.search(href) - if match: - paper_url = urllib.parse.urljoin(url, href) - parsed = urllib.parse.urlparse(paper_url) - base = urllib.parse.urlparse(BASE_URL) - if ( - parsed.scheme not in ("https", "http") - or parsed.netloc != base.netloc - ): - logger.warning("Skipping off-origin paper URL %s", paper_url) - continue - - paper_id = match.group(1).lower() - file_ext = match.group(2).lower() - filename = match.group(0).lower() - - title = "" - if len(cells) > 1: - title = cells[1].text.strip() - - authors = [] - if len(cells) > 2: - authors_raw = cells[2].text.strip() - # Split by comma or 'and' if multiple - if authors_raw: - authors = [ - a.strip() - for a in re.split(r",| and ", authors_raw) - if a.strip() - ] - - document_date = None - if len(cells) > 3: - date_str = cells[3].text.strip() - if date_str: - document_date = date_str # Will be parsed/saved in pipeline - - subgroup = "" - if len(cells) > 4: - subgroup = cells[4].text.strip() - - paper_urls.append( - { - "url": paper_url, - "filename": filename, - "type": file_ext, - "paper_id": paper_id, - "title": title, - "authors": authors, - "document_date": document_date, - "subgroup": subgroup, - } - ) - break # Only take the first paper link in the cell + paper = extract_paper_metadata_from_table_row(cells, url) + if paper: + paper_urls.append(paper) # Remove exact duplicates (same filename) seen = set() diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py 
b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py index feff138..824617a 100644 --- a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py +++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py @@ -7,10 +7,15 @@ (unknown / Unknown). """ +from __future__ import annotations + import csv import logging import re +from dataclasses import dataclass +from datetime import date from pathlib import Path +from typing import Optional from django.core.management.base import BaseCommand, CommandError from django.db import IntegrityError @@ -93,6 +98,131 @@ def _read_csv_rows(csv_path: Path): yield out +@dataclass(frozen=True) +class _CsvImportRow: + paper_id: str + url: str + mailing_date: str + mailing_title: str + document_date: Optional[date] + year: Optional[int] + title: str + subgroup: str + author_names: list[str] + + +def _parse_csv_import_row(row: dict) -> _CsvImportRow | None: + """Return parsed row, or None when paper_id or url is missing.""" + paper_id = (row.get("paper_id", "") or "").strip().lower() + url = row.get("url", "") + if not paper_id or not url: + return None + + mailing_date, mailing_title = _resolve_mailing_date(row.get("mailing_date", "")) + document_date = _parse_document_date(row.get("date", "")) + if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): + year = int(mailing_date[:4]) + elif document_date is not None: + year = document_date.year + else: + year = None + title = row.get("title", "") or paper_id + subgroup = row.get("subgroup", "") + author_names = _author_names_from_csv(row.get("author", "")) + return _CsvImportRow( + paper_id=paper_id, + url=url, + mailing_date=mailing_date, + mailing_title=mailing_title, + document_date=document_date, + year=year, + title=title, + subgroup=subgroup, + author_names=author_names, + ) + + +def _log_dry_run_row(parsed: _CsvImportRow) -> None: + logger.info( + "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", + parsed.paper_id, + parsed.mailing_date, + parsed.document_date, + len(parsed.author_names), + ) + + +def _attach_csv_authors_to_paper(paper: WG21Paper, author_names: list[str]) -> None: + from cppa_user_tracker.services import ( + get_or_create_wg21_paper_author_profile, + ) + + for i, name in enumerate(author_names): + profile, _ = get_or_create_wg21_paper_author_profile(name) + get_or_create_paper_author(paper, profile, i + 1) + + +def _update_paper_on_integrity_error( + parsed: _CsvImportRow, exc: IntegrityError, stats: dict +) -> None: + mailing, _ = get_or_create_mailing(parsed.mailing_date, parsed.mailing_title) + try: + lookup_year = parsed.year if parsed.year is not None else 0 + paper = WG21Paper.objects.filter( + paper_id=parsed.paper_id, year=lookup_year + ).first() + if paper is None: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, exc) + return + paper.url = parsed.url + paper.title = parsed.title + paper.document_date = parsed.document_date + paper.mailing = mailing + paper.subgroup = parsed.subgroup + if parsed.year is not None: + paper.year = parsed.year + paper.save() + stats["papers_updated"] += 1 + if parsed.author_names: + _attach_csv_authors_to_paper(paper, parsed.author_names) + except Exception: + stats["skipped"] += 1 + logger.exception( + "Error for paper_id=%s (after IntegrityError).", + parsed.paper_id, + ) + + +def _upsert_paper_from_csv_row(parsed: _CsvImportRow, stats: dict) -> None: + try: + mailing, mailing_created = get_or_create_mailing( + 
parsed.mailing_date, parsed.mailing_title + ) + if mailing_created: + stats["mailings_created"] += 1 + + _paper, paper_created = get_or_create_paper( + paper_id=parsed.paper_id, + url=parsed.url, + title=parsed.title, + document_date=parsed.document_date, + mailing=mailing, + subgroup=parsed.subgroup, + author_names=parsed.author_names if parsed.author_names else None, + year=parsed.year, + ) + if paper_created: + stats["papers_created"] += 1 + else: + stats["papers_updated"] += 1 + except IntegrityError as e: + _update_paper_on_integrity_error(parsed, e, stats) + except Exception as e: + stats["skipped"] += 1 + logger.error("Error for paper_id=%s: %s", parsed.paper_id, e) + + class Command(BaseCommand): help = ( "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). " @@ -133,34 +263,11 @@ def handle(self, *args, **options): for row in _read_csv_rows(csv_path): stats["rows"] += 1 - paper_id = (row.get("paper_id", "") or "").strip().lower() - url = row.get("url", "") - - if not paper_id or not url: - stats["skipped"] += 1 - if stats["skipped"] <= 5: - logger.debug( - "Skipping row: missing paper_id or url: %s", - row.get("paper_id", "") or row.get("url", "")[:50], - ) - continue - - mailing_date, mailing_title = _resolve_mailing_date( - row.get("mailing_date", "") - ) try: - document_date = _parse_document_date(row.get("date", "")) - if mailing_date and MAILING_DATE_PATTERN.match(mailing_date): - year = int(mailing_date[:4]) - elif document_date is not None: - year = document_date.year - else: - year = None - title = row.get("title", "") or paper_id - subgroup = row.get("subgroup", "") - author_names = _author_names_from_csv(row.get("author", "")) + parsed = _parse_csv_import_row(row) except Exception as e: stats["skipped"] += 1 + paper_id = (row.get("paper_id", "") or "").strip().lower() logger.error( "Error parsing document date for paper_id=%s: %s", paper_id, @@ -168,78 +275,20 @@ def handle(self, *args, **options): ) continue + if parsed is None: + stats["skipped"] += 1 + if stats["skipped"] <= 5: + logger.debug( + "Skipping row: missing paper_id or url: %s", + row.get("paper_id", "") or row.get("url", "")[:50], + ) + continue + if dry_run: - logger.info( - "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d", - paper_id, - mailing_date, - document_date, - len(author_names), - ) + _log_dry_run_row(parsed) continue - try: - mailing, mailing_created = get_or_create_mailing( - mailing_date, mailing_title - ) - if mailing_created: - stats["mailings_created"] += 1 - - paper, paper_created = get_or_create_paper( - paper_id=paper_id, - url=url, - title=title, - document_date=document_date, - mailing=mailing, - subgroup=subgroup, - author_names=author_names if author_names else None, - year=year, - ) - if paper_created: - stats["papers_created"] += 1 - else: - stats["papers_updated"] += 1 - except IntegrityError as e: - # Re-resolve mailing (IntegrityError may have come from get_or_create_mailing race) - mailing, _ = get_or_create_mailing(mailing_date, mailing_title) - # Duplicate (paper_id, year): fetch existing by same key and update - try: - lookup_year = year if year is not None else 0 - paper = WG21Paper.objects.filter( - paper_id=paper_id, year=lookup_year - ).first() - if paper is None: - stats["skipped"] += 1 - logger.error("Error for paper_id=%s: %s", paper_id, e) - else: - paper.url = url - paper.title = title - paper.document_date = document_date - paper.mailing = mailing - paper.subgroup = subgroup - if year is not None: - paper.year = year - 
paper.save() - stats["papers_updated"] += 1 - if author_names: - from cppa_user_tracker.services import ( - get_or_create_wg21_paper_author_profile, - ) - - for i, name in enumerate(author_names): - profile, _ = get_or_create_wg21_paper_author_profile( - name - ) - get_or_create_paper_author(paper, profile, i + 1) - except Exception: - stats["skipped"] += 1 - logger.exception( - "Error for paper_id=%s (after IntegrityError).", - paper_id, - ) - except Exception as e: - stats["skipped"] += 1 - logger.error("Error for paper_id=%s: %s", paper_id, e) + _upsert_paper_from_csv_row(parsed, stats) logger.info( "Rows processed: %d, skipped: %d, mailings created: %d, papers created: %d, papers updated: %d", diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py index b1885af..3f0965d 100644 --- a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py +++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py @@ -1,97 +1,158 @@ """ Management command for WG21 Paper Tracker. -Runs the pipeline to fetch new mailings, download papers, upload to GCS, and update DB. -If new papers were found and uploaded, it triggers the Google Cloud Run conversion job. +Runs the pipeline to fetch new mailings, upsert paper metadata in the DB, and optionally +trigger a GitHub repository_dispatch so another repo can download and convert documents. """ import logging -from django.core.management.base import BaseCommand + +import requests from django.conf import settings +from django.core.management.base import BaseCommand, CommandError from wg21_paper_tracker.pipeline import run_tracker_pipeline logger = logging.getLogger(__name__) - -def trigger_cloud_run_job(project_id: str, location: str, job_name: str): - """ - Start the named Cloud Run job (run once, no polling). - - Uses the Cloud Run v2 API to trigger the job identified by project_id, - location, and job_name. The job runs asynchronously; this function returns - the operation and does not wait for the job to finish. - """ - from google.cloud import run_v2 - - client = run_v2.JobsClient() - name = client.job_path(project_id, location, job_name) - request = run_v2.RunJobRequest(name=name) - logger.info("Triggering Cloud Run job %s...", name) - operation = client.run_job(request=request) - logger.info("Cloud Run job triggered. 
Operation: %s", operation.operation.name)
-    return operation
+GITHUB_DISPATCH_URL = "https://api.github.com/repos/{repo}/dispatches"
+
+
+def trigger_github_repository_dispatch(
+    repo: str,
+    event_type: str,
+    token: str,
+    paper_urls: list[str],
+) -> None:
+    """POST repository_dispatch with client_payload {"papers": [<url>, ...]}."""
+    url = GITHUB_DISPATCH_URL.format(repo=repo.strip())
+    headers = {
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {token.strip()}",
+        "X-GitHub-Api-Version": "2022-11-28",
+    }
+    body = {
+        "event_type": event_type,
+        "client_payload": {"papers": paper_urls},
+    }
+    logger.info(
+        "Sending repository_dispatch to %s (event_type=%s, %d URLs).",
+        repo,
+        event_type,
+        len(paper_urls),
+    )
+    response = requests.post(url, json=body, headers=headers, timeout=30)
+    if not response.ok:
+        logger.error(
+            "GitHub repository_dispatch failed: %s %s",
+            response.status_code,
+            response.text,
+        )
+        response.raise_for_status()
 
 
 class Command(BaseCommand):
-    """Run WG21 paper tracker and optionally trigger the Cloud Run conversion job."""
+    """Run WG21 paper tracker and optionally trigger GitHub repository_dispatch."""
 
-    help = "Run WG21 paper tracker (fetch, download to GCS, DB update) and trigger Cloud Run if new papers."
+    help = (
+        "Run WG21 paper tracker (scrape, DB update) and send new paper URLs via "
+        "repository_dispatch when enabled."
+    )
 
     def add_arguments(self, parser):
-        """Register --dry-run so the command can skip pipeline and Cloud Run."""
         parser.add_argument(
             "--dry-run",
             action="store_true",
-            help="Only log what would be done; do not run the pipeline or trigger Cloud Run.",
+            help="Only log what would be done; do not run the pipeline or dispatch.",
+        )
+        parser.add_argument(
+            "--from-date",
+            dest="from_date",
+            metavar="YYYY-MM",
+            default=None,
+            help=(
+                "Process mailings with mailing_date >= YYYY-MM (WG21 / CSV style). "
+                "Backfills from that mailing onward; without --to-date, no upper cap."
+            ),
+        )
+        parser.add_argument(
+            "--to-date",
+            dest="to_date",
+            metavar="YYYY-MM",
+            default=None,
+            help=(
+                "Upper bound: mailing_date <= YYYY-MM. With --from-date, inclusive range; "
+                "without --from-date, still only mailings newer than DB latest (capped at to)."
+            ),
         )
 
     def handle(self, *args, **options):
-        """
-        Run the tracker pipeline; if new papers were uploaded, trigger the Cloud Run job.
-
-        With --dry-run, logs and exits without running the pipeline or triggering Cloud Run.
-        Otherwise runs the pipeline, then triggers the configured Cloud Run job when
-        total_new_papers > 0, WG21_CLOUD_RUN_ENABLED is True, and
-        GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and WG21_GCS_BUCKET are set. 
- """ dry_run = options.get("dry_run", False) + from_date = options.get("from_date") + to_date = options.get("to_date") + if from_date is not None: + from_date = from_date.strip() + if not from_date: + from_date = None + if to_date is not None: + to_date = to_date.strip() + if not to_date: + to_date = None if dry_run: - logger.info("Dry run: skipping pipeline and Cloud Run trigger.") + if from_date or to_date: + logger.info( + "Dry run: skipping pipeline and GitHub dispatch " + "(from=%r, to=%r).", + from_date, + to_date, + ) + else: + logger.info("Dry run: skipping pipeline and GitHub dispatch.") return logger.info("Starting WG21 Paper Tracker...") try: - total_new_papers = run_tracker_pipeline() - logger.info("Processed %d new papers.", total_new_papers) - - if total_new_papers > 0: - project_id = getattr(settings, "GCP_PROJECT_ID", None) - location = getattr(settings, "GCP_LOCATION", "us-central1") - job_name = getattr(settings, "WG21_CLOUD_RUN_JOB_NAME", None) - bucket = getattr(settings, "WG21_GCS_BUCKET", None) - cloud_run_enabled = getattr(settings, "WG21_CLOUD_RUN_ENABLED", False) - - if project_id and job_name and bucket and cloud_run_enabled: - try: - trigger_cloud_run_job(project_id, location, job_name) - logger.info( - "Successfully triggered Cloud Run job %s.", job_name - ) - except Exception: - logger.exception( - "Failed to trigger Cloud Run job %s.", job_name - ) - raise - else: - logger.warning( - "Skipping Cloud Run trigger: set WG21_CLOUD_RUN_ENABLED=True " - "and configure GCP_PROJECT_ID, WG21_CLOUD_RUN_JOB_NAME, and " - "WG21_GCS_BUCKET to enable." - ) - else: - logger.info("No new papers found. Skipping Cloud Run job.") - + result = run_tracker_pipeline( + from_mailing_date=from_date, + to_mailing_date=to_date, + ) + n = result.new_paper_count + logger.info("Recorded %d new paper(s); %d URL(s) for dispatch.", n, n) + + if not n: + logger.info("No new papers in this run. Skipping GitHub dispatch.") + return + + repo = getattr(settings, "WG21_GITHUB_DISPATCH_REPO", "") or "" + token = getattr(settings, "WG21_GITHUB_DISPATCH_TOKEN", "") or "" + enabled = getattr(settings, "WG21_GITHUB_DISPATCH_ENABLED", False) + event_type = getattr( + settings, + "WG21_GITHUB_DISPATCH_EVENT_TYPE", + "wg21_papers_convert", + ) + + if not enabled or not repo or not token: + logger.warning( + "Skipping GitHub dispatch: set WG21_GITHUB_DISPATCH_ENABLED=True " + "and configure WG21_GITHUB_DISPATCH_REPO and " + "WG21_GITHUB_DISPATCH_TOKEN." + ) + return + try: + trigger_github_repository_dispatch( + repo, + event_type, + token, + list(result.new_paper_urls), + ) + logger.info("repository_dispatch sent successfully.") + except Exception: + logger.exception("Failed to send repository_dispatch.") + raise + + except ValueError as e: + raise CommandError(str(e)) from e except Exception as e: logger.exception("WG21 Paper Tracker failed: %s", e) raise diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index 516837e..ff15f50 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -1,114 +1,293 @@ """ Pipeline for WG21 Paper Tracker. -Coordinates scraping, downloading, uploading to GCS, and updating the database. +Coordinates scraping and updating the database (metadata only; no file download or GCS). 
""" -import time -import requests +from __future__ import annotations + import logging -from pathlib import Path +import re +from dataclasses import dataclass, field +from datetime import date, datetime +from typing import Any, Optional -from django.conf import settings -from google.cloud import storage +from django.utils.dateparse import parse_date from wg21_paper_tracker.fetcher import ( fetch_all_mailings, fetch_papers_for_mailing, ) -from wg21_paper_tracker.models import WG21Mailing, WG21Paper +from wg21_paper_tracker.models import WG21Mailing from wg21_paper_tracker.services import ( get_or_create_mailing, get_or_create_paper, ) -from wg21_paper_tracker.workspace import get_raw_dir logger = logging.getLogger(__name__) -DOWNLOAD_TIMEOUT = 30 -DOWNLOAD_MAX_RETRIES = 3 -DOWNLOAD_RETRY_DELAY = 2 +# WG21 mailing_date and typical CSV column (e.g. 2025-03, 2026-01) +_MAILING_DATE_LABEL_RE = re.compile(r"^\d{4}-\d{2}$") + + +def _normalize_mailing_date_label(label: str, *, field_name: str) -> str: + s = label.strip() + if not _MAILING_DATE_LABEL_RE.match(s): + raise ValueError( + f"Invalid {field_name} {label!r}; " + "expected YYYY-MM (e.g. 2025-03), same as WG21 / CSV mailing keys." + ) + return s -def _upload_to_gcs( - bucket_name: str, source_path: Path, destination_blob_name: str +def _mailing_date_in_run_scope( + mailing_date: str, + *, + latest_date: str, + from_mailing_date: Optional[str], + to_mailing_date: Optional[str], ) -> bool: - """Uploads a file to the bucket.""" + """Whether a mailing key is selected for this run (before retry merge).""" + if from_mailing_date is None and to_mailing_date is None: + return mailing_date > latest_date + + if from_mailing_date is not None and mailing_date < from_mailing_date: + return False + if to_mailing_date is not None and mailing_date > to_mailing_date: + return False + if from_mailing_date is None and to_mailing_date is not None: + return mailing_date > latest_date + return True + + +def _format_priority(ext: str) -> int: + """Prefer adoc > html > ps > pdf when multiple formats exist for one paper_id.""" + priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} + return priorities.get(ext.lower(), 100) + + +def _parse_mailing_year(m_info: dict) -> int: + """Return 4-digit year from the index mailing dict, or 0 if missing/invalid.""" + mailing_date = m_info["mailing_date"] + year_raw = m_info.get("year") + if not year_raw or not str(year_raw).strip(): + logger.warning( + "Mailing %s: year missing or empty, using 0 (fix later).", + mailing_date, + ) + return 0 try: - storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name) - blob = bucket.blob(destination_blob_name) + year = int(str(year_raw).strip()[:4]) + except (ValueError, TypeError): + logger.warning( + "Mailing %s: year not parseable %r, using 0 (fix later).", + mailing_date, + year_raw, + ) + return 0 + if year <= 0 or year > datetime.now().year + 1: + logger.warning( + "Mailing %s: year invalid, using 0 (fix later).", + mailing_date, + ) + return 0 + return year - blob.upload_from_filename(str(source_path)) - logger.info( - "Uploaded %s to gs://%s/%s", - source_path.name, - bucket_name, - destination_blob_name, + +def _group_fetched_papers_by_id( + papers: list[dict[str, Any]], mailing_date: str +) -> dict[str, list[dict[str, Any]]]: + """Bucket fetcher rows by normalized paper_id.""" + papers_by_id: dict[str, list[dict[str, Any]]] = {} + for p in papers: + pid = (p.get("paper_id") or "").strip().lower() + if not pid: + logger.warning( + "Skipping paper entry 
without a paper_id in mailing %s: %r", + mailing_date, + p, + ) + continue + papers_by_id.setdefault(pid, []).append(p) + return papers_by_id + + +def _valid_paper_entries_for_id( + p_list: list[dict[str, Any]], pid: str, mailing_date: str +) -> list[dict[str, Any]]: + """Keep rows that have type, url, and title (all non-empty).""" + valid: list[dict[str, Any]] = [] + for p in p_list: + type_val = ( + (p.get("type") or "").strip() if isinstance(p.get("type"), str) else "" + ) + url_val = (p.get("url") or "").strip() if isinstance(p.get("url"), str) else "" + title_val = ( + (p.get("title") or "").strip() if isinstance(p.get("title"), str) else "" ) - return True + if not type_val or not url_val or not title_val: + logger.debug( + "Skipping malformed paper entry for %s in mailing %s: %r", + pid, + mailing_date, + p, + ) + continue + valid.append(p) + return valid + + +def _choose_best_format_entry(valid_list: list[dict[str, Any]]) -> dict[str, Any]: + """Pick one row by format priority (adoc first). Precondition: valid_list non-empty.""" + return min( + valid_list, + key=lambda x: _format_priority(str(x.get("type") or "").strip()), + ) + + +def _parse_scraped_document_date(doc_date_str: Any) -> Optional[date]: + if not doc_date_str: + return None + try: + return parse_date(str(doc_date_str).strip()) except Exception as e: - logger.error("Failed to upload to GCS: %s", e) - return False + logger.warning( + "Failed to parse document date: %s: %s", + doc_date_str, + e, + ) + return None + + +def _upsert_paper_from_scraped_row( + pid: str, + best_paper: dict[str, Any], + mailing_obj: WG21Mailing, + year: int, + mailing_date: str, +) -> Optional[str]: + """ + Create or update WG21Paper from the chosen fetcher row. + Returns the document URL if a **new** row was inserted, else None. + """ + url = (best_paper.get("url") or "").strip() + paper_title = (best_paper.get("title") or "").strip() + subgroup = (best_paper.get("subgroup") or "").strip() + authors = best_paper.get("authors") + if not isinstance(authors, list): + authors = [] + if not url or not paper_title: + logger.warning( + "Skipping paper %s in mailing %s due to missing required fields: %r", + pid, + mailing_date, + best_paper, + ) + return None + + doc_date = _parse_scraped_document_date(best_paper.get("document_date")) + _paper_obj, created = get_or_create_paper( + paper_id=pid, + url=url, + title=paper_title, + document_date=doc_date, + mailing=mailing_obj, + subgroup=subgroup, + author_names=authors, + year=year, + ) + return url if created else None -def _download_file(url: str, filepath: Path) -> bool: - """Download file from URL to filepath with retries and 30s timeout.""" - for attempt in range(1, DOWNLOAD_MAX_RETRIES + 1): - try: - logger.info( - "Downloading %s to %s (attempt %d/%d)", - url, - filepath, - attempt, - DOWNLOAD_MAX_RETRIES, +def _process_single_mailing(m_info: dict) -> list[str]: + """ + For one mailing from the index: normalize year, get/create WG21Mailing, + fetch paper rows from the site, upsert WG21Paper rows. + + Returns URLs for papers **newly created** in this run for this mailing. 
+ """ + mailing_date = m_info["mailing_date"] + title = m_info["title"] + year = _parse_mailing_year(m_info) + mailing_obj, _ = get_or_create_mailing(mailing_date, title) + + papers = fetch_papers_for_mailing(str(year), mailing_date) + if not papers: + logger.info( + "Mailing %s: no papers found (anchor/table may be missing).", + mailing_date, + ) + return [] + + papers_by_id = _group_fetched_papers_by_id(papers, mailing_date) + new_urls: list[str] = [] + + for pid, p_list in papers_by_id.items(): + valid_list = _valid_paper_entries_for_id(p_list, pid, mailing_date) + if not valid_list: + logger.warning( + "Skipping paper %s in mailing %s: no valid entries (type, url, title)", + pid, + mailing_date, ) - response = requests.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) - response.raise_for_status() - - # For text-based files, save as UTF-8. For binary (like PDF), save as bytes. - content_type = response.headers.get("content-type", "") - if "text" in content_type: - with open(filepath, "w", encoding="utf-8") as f: - f.write( - response.content.decode( - response.apparent_encoding or "utf-8", - errors="replace", - ) - ) - else: - with open(filepath, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) - return True - except Exception as e: - if attempt < DOWNLOAD_MAX_RETRIES: - logger.warning( - "Download attempt %d/%d failed for %s: %s. Retrying in %ds.", - attempt, - DOWNLOAD_MAX_RETRIES, - url, - e, - DOWNLOAD_RETRY_DELAY, - ) - time.sleep(DOWNLOAD_RETRY_DELAY) - else: - logger.error( - "Failed to download %s after %d attempts: %s", - url, - DOWNLOAD_MAX_RETRIES, - e, - ) - return False - - -def run_tracker_pipeline() -> int: + continue + best_paper = _choose_best_format_entry(valid_list) + url = _upsert_paper_from_scraped_row( + pid, best_paper, mailing_obj, year, mailing_date + ) + if url: + new_urls.append(url) + + return new_urls + + +@dataclass(frozen=True) +class TrackerPipelineResult: + """Result of run_tracker_pipeline: URLs for papers newly created in this run.""" + + new_paper_urls: tuple[str, ...] = field(default_factory=tuple) + + @property + def new_paper_count(self) -> int: + return len(self.new_paper_urls) + + +def run_tracker_pipeline( + *, + from_mailing_date: Optional[str] = None, + to_mailing_date: Optional[str] = None, +) -> TrackerPipelineResult: """ - Run the WG21 tracker pipeline. - Returns the number of new papers downloaded and uploaded. + Run the WG21 tracker pipeline: scrape mailings, upsert papers in the DB. + Returns URLs for rows created in this run (for GitHub repository_dispatch). + + Mailing keys are ``YYYY-MM`` (WG21 / CSV style). Selection: + + - Neither ``from_mailing_date`` nor ``to_mailing_date``: process mailings with + ``mailing_date`` strictly newer than the latest ``WG21Mailing`` in the DB. + - ``from_mailing_date`` only: ``mailing_date >= from_mailing_date``. + - ``to_mailing_date`` only: ``mailing_date > latest_in_db`` and + ``mailing_date <= to_mailing_date`` (incremental runs capped at ``to``). + - Both: ``from_mailing_date <= mailing_date <= to_mailing_date`` (inclusive). + + ``from_mailing_date`` must not be lexicographically after ``to_mailing_date``. """ - bucket_name = settings.WG21_GCS_BUCKET - if not bucket_name: - logger.warning("WG21_GCS_BUCKET not set. 
Will download but not upload to GCS.") + if from_mailing_date is not None: + from_mailing_date = _normalize_mailing_date_label( + from_mailing_date, field_name="from_mailing_date" + ) + if to_mailing_date is not None: + to_mailing_date = _normalize_mailing_date_label( + to_mailing_date, field_name="to_mailing_date" + ) + if ( + from_mailing_date is not None + and to_mailing_date is not None + and from_mailing_date > to_mailing_date + ): + raise ValueError( + f"from_mailing_date {from_mailing_date!r} is after " + f"to_mailing_date {to_mailing_date!r}." + ) # 1. Get latest mailing from DB latest_mailing = ( @@ -122,23 +301,49 @@ def run_tracker_pipeline() -> int: all_mailings = fetch_all_mailings() if not all_mailings: logger.warning("No mailings found on WG21 site.") - return 0 + return TrackerPipelineResult() + + # Filter mailings to process + new_mailings = [ + m + for m in all_mailings + if _mailing_date_in_run_scope( + m["mailing_date"], + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + ] + if from_mailing_date is None and to_mailing_date is None: + baseline_desc = f"latest_in_db={latest_date}" + else: + parts: list[str] = [] + if from_mailing_date is not None: + parts.append(f"from={from_mailing_date}") + if to_mailing_date is not None: + parts.append(f"to={to_mailing_date}") + if from_mailing_date is None: + parts.append(f"latest_in_db={latest_date}") + baseline_desc = ", ".join(parts) - # Filter newer mailings - new_mailings = [m for m in all_mailings if m["mailing_date"] > latest_date] # Requeue incomplete mailings so transient failures get retried (not just the latest) retry_dates = set( WG21Mailing.objects.filter(papers__isnull=True).values_list( "mailing_date", flat=True ) ) - retry_dates.update( - WG21Mailing.objects.filter(papers__is_downloaded=False).values_list( - "mailing_date", flat=True - ) - ) if latest_mailing: retry_dates.add(latest_mailing.mailing_date) + retry_dates = { + d + for d in retry_dates + if _mailing_date_in_run_scope( + d, + latest_date=latest_date, + from_mailing_date=from_mailing_date, + to_mailing_date=to_mailing_date, + ) + } for current_m in all_mailings: if current_m["mailing_date"] in retry_dates and current_m[ "mailing_date" @@ -149,211 +354,13 @@ def run_tracker_pipeline() -> int: new_mailings.sort(key=lambda x: x["mailing_date"]) logger.info( - "Pipeline: latest_date=%s, all_mailings=%d, mailings_to_process=%s", - latest_date, + "Pipeline: %s, all_mailings=%d, mailings_to_process=%s", + baseline_desc, len(all_mailings), [m["mailing_date"] for m in new_mailings], ) - total_new_papers = 0 - + new_urls: list[str] = [] for m_info in new_mailings: - mailing_date = m_info["mailing_date"] - title = m_info["title"] - # Normalize year once; use 0 when missing/empty/unparseable so you can fix later - year_raw = m_info.get("year") - if not year_raw or not str(year_raw).strip(): - year = 0 - logger.warning( - "Mailing %s: year missing or empty, using 0 (fix later).", - mailing_date, - ) - else: - try: - year = int(str(year_raw).strip()[:4]) - if year <= 0: - year = 0 - logger.warning( - "Mailing %s: year invalid, using 0 (fix later).", - mailing_date, - ) - except (ValueError, TypeError): - year = 0 - logger.warning( - "Mailing %s: year not parseable %r, using 0 (fix later).", - mailing_date, - year_raw, - ) - - # Create/get mailing in DB - mailing_obj, _ = get_or_create_mailing(mailing_date, title) - - # Fetch papers for this mailing - papers = fetch_papers_for_mailing(str(year), mailing_date) - if not 
papers: - logger.info( - "Mailing %s: no papers found (anchor/table may be missing).", - mailing_date, - ) - continue - - # Group papers by ID so we can choose the preferred source format per paper. - papers_by_id = {} - for p in papers: - pid = (p.get("paper_id") or "").strip().lower() - if not pid: - logger.warning( - "Skipping paper entry without a paper_id in mailing %s: %r", - mailing_date, - p, - ) - continue - if pid not in papers_by_id: - papers_by_id[pid] = [] - papers_by_id[pid].append(p) - - def format_priority(ext: str) -> int: - priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4} - return priorities.get(ext.lower(), 100) - - raw_dir = get_raw_dir(mailing_date, year) - - skipped_downloaded = 0 - for pid, p_list in papers_by_id.items(): - # Skip only if this (paper_id, year) is already downloaded - if WG21Paper.objects.filter( - paper_id=pid, - year=year, - is_downloaded=True, - ).exists(): - skipped_downloaded += 1 - continue - - # Filter to entries with required keys and valid types; skip malformed. - valid_list = [] - for p in p_list: - type_val = ( - (p.get("type") or "").strip() - if isinstance(p.get("type"), str) - else "" - ) - url_val = ( - (p.get("url") or "").strip() - if isinstance(p.get("url"), str) - else "" - ) - title_val = ( - (p.get("title") or "").strip() - if isinstance(p.get("title"), str) - else "" - ) - if not type_val or not url_val or not title_val: - logger.debug( - "Skipping malformed paper entry for %s in mailing %s: %r", - pid, - mailing_date, - p, - ) - continue - valid_list.append(p) - - if not valid_list: - logger.warning( - "Skipping paper %s in mailing %s: no valid entries (type, url, title)", - pid, - mailing_date, - ) - continue - - # Pick the preferred format: adoc > html > ps > pdf. - best_paper = min( - valid_list, - key=lambda x: format_priority(str(x.get("type") or "").strip()), - ) - url = (best_paper.get("url") or "").strip() - title = (best_paper.get("title") or "").strip() - subgroup = (best_paper.get("subgroup") or "").strip() - authors = best_paper.get("authors") - if not isinstance(authors, list): - authors = [] - if not url or not title: - logger.warning( - "Skipping paper %s in mailing %s due to missing required fields: %r", - pid, - mailing_date, - best_paper, - ) - continue - - raw_filename = (best_paper.get("filename") or "").strip() - filename = Path(raw_filename).name - if not filename or filename != raw_filename: - logger.warning( - "Skipping paper %s due to unsafe filename %r", - pid, - raw_filename, - ) - continue - local_path = raw_dir / filename - - # Persist paper row before transfer so failed downloads remain retry candidates - doc_date_str = best_paper.get("document_date") - from django.utils.dateparse import parse_date - - doc_date = None - if doc_date_str: - try: - doc_date = parse_date(doc_date_str) - except Exception as e: - logger.warning( - "Failed to parse document date: %s: %s", - doc_date_str, - e, - ) - doc_date = None - - paper_obj, _created = get_or_create_paper( - paper_id=pid, - url=url, - title=title, - document_date=doc_date, - mailing=mailing_obj, - subgroup=subgroup, - author_names=authors, - year=year, - ) - - # Download - if _download_file(url, local_path): - uploaded = False - if bucket_name: - gcs_path = ( - f"raw/wg21_paper_tracker/{year}/{mailing_date}/{filename}" - ) - uploaded = _upload_to_gcs(bucket_name, local_path, gcs_path) - else: - logger.warning( - "WG21_GCS_BUCKET is not configured; leaving %s as not downloaded.", - pid, - ) - - if uploaded: - paper_obj.is_downloaded = True - 
paper_obj.save(update_fields=["is_downloaded", "updated_at"]) - total_new_papers += 1 - - # Clean up local file to save space - # try: - # # local_path.unlink() - # except Exception as e: - # logger.warning( - # "Could not delete temp file %s: %s", local_path, e - # ) - - if skipped_downloaded: - logger.info( - "Mailing %s: skipped %d papers (already downloaded).", - mailing_date, - skipped_downloaded, - ) + new_urls.extend(_process_single_mailing(m_info)) - return total_new_papers + return TrackerPipelineResult(new_paper_urls=tuple(new_urls)) diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py index 34a52e9..a099ccb 100644 --- a/wg21_paper_tracker/tests/test_commands.py +++ b/wg21_paper_tracker/tests/test_commands.py @@ -1,12 +1,18 @@ """Tests for wg21_paper_tracker management commands.""" +from unittest.mock import MagicMock, patch + import pytest from django.core.management import call_command from django.core.management.base import CommandError +from django.test.utils import override_settings + +from wg21_paper_tracker.pipeline import TrackerPipelineResult CMD_NAME = "import_wg21_metadata_from_csv" +RUN_TRACKER_CMD = "run_wg21_paper_tracker" def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): @@ -16,3 +22,117 @@ def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path): with pytest.raises(CommandError, match=r"File not found:"): call_command(CMD_NAME, f"--csv-file={csv_path}") + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_posts_dispatch_when_enabled(): + """run_wg21_paper_tracker sends repository_dispatch with papers URL list.""" + mock_resp = MagicMock() + mock_resp.ok = True + mock_resp.status_code = 204 + mock_resp.text = "" + + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult( + new_paper_urls=("https://open-std.org/a.pdf", "https://open-std.org/b.pdf") + ), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + return_value=mock_resp, + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + WG21_GITHUB_DISPATCH_REPO="myorg/convert-repo", + WG21_GITHUB_DISPATCH_TOKEN="secret-token", + WG21_GITHUB_DISPATCH_EVENT_TYPE="wg21_papers_convert", + ): + call_command(RUN_TRACKER_CMD) + + m_post.assert_called_once() + assert m_post.call_args[0][0] == ( + "https://api.github.com/repos/myorg/convert-repo/dispatches" + ) + body = m_post.call_args[1]["json"] + assert body["event_type"] == "wg21_papers_convert" + assert body["client_payload"] == { + "papers": [ + "https://open-std.org/a.pdf", + "https://open-std.org/b.pdf", + ], + } + headers = m_post.call_args[1]["headers"] + assert headers["Authorization"] == "Bearer secret-token" + assert headers["Accept"] == "application/vnd.github+json" + + +@pytest.mark.django_db +def test_run_wg21_paper_tracker_skips_post_when_no_new_papers(): + """No HTTP request when pipeline returns no new URLs.""" + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline", + return_value=TrackerPipelineResult(), + ): + with patch( + "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post", + ) as m_post: + with override_settings( + WG21_GITHUB_DISPATCH_ENABLED=True, + WG21_GITHUB_DISPATCH_REPO="o/r", + WG21_GITHUB_DISPATCH_TOKEN="t", + ): + call_command(RUN_TRACKER_CMD) + m_post.assert_not_called() + + +@pytest.mark.django_db +def 
test_run_wg21_paper_tracker_skips_post_when_dispatch_disabled():
+    """No HTTP request when WG21_GITHUB_DISPATCH_ENABLED is False."""
+    with patch(
+        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
+        return_value=TrackerPipelineResult(new_paper_urls=("https://x/y",)),
+    ):
+        with patch(
+            "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post",
+        ) as m_post:
+            with override_settings(
+                WG21_GITHUB_DISPATCH_ENABLED=False,
+                WG21_GITHUB_DISPATCH_REPO="o/r",
+                WG21_GITHUB_DISPATCH_TOKEN="t",
+            ):
+                call_command(RUN_TRACKER_CMD)
+            m_post.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_run_wg21_paper_tracker_rejects_invalid_from_date():
+    """--from-date must be YYYY-MM."""
+    with pytest.raises(CommandError, match="Invalid from_mailing_date"):
+        call_command(RUN_TRACKER_CMD, "--from-date=bad")
+
+
+@pytest.mark.django_db
+def test_run_wg21_paper_tracker_passes_from_date_to_pipeline():
+    with patch(
+        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
+        return_value=TrackerPipelineResult(),
+    ) as m:
+        call_command(RUN_TRACKER_CMD, "--from-date=2025-03")
+    m.assert_called_once_with(from_mailing_date="2025-03", to_mailing_date=None)
+
+
+@pytest.mark.django_db
+def test_run_wg21_paper_tracker_rejects_invalid_to_date():
+    with pytest.raises(CommandError, match="Invalid to_mailing_date"):
+        call_command(RUN_TRACKER_CMD, "--to-date=bad")
+
+
+@pytest.mark.django_db
+def test_run_wg21_paper_tracker_passes_from_and_to_date_to_pipeline():
+    with patch(
+        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
+        return_value=TrackerPipelineResult(),
+    ) as m:
+        call_command(RUN_TRACKER_CMD, "--from-date=2025-01", "--to-date=2025-03")
+    m.assert_called_once_with(from_mailing_date="2025-01", to_mailing_date="2025-03")
diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py
index 3b903fb..93b2158 100644
--- a/wg21_paper_tracker/tests/test_fetcher.py
+++ b/wg21_paper_tracker/tests/test_fetcher.py
@@ -3,9 +3,11 @@
 from unittest.mock import patch, MagicMock
 
 import requests
+from bs4 import BeautifulSoup
 
 from wg21_paper_tracker.fetcher import (
     BASE_URL,
+    extract_paper_metadata_from_table_row,
     fetch_all_mailings,
     fetch_papers_for_mailing,
 )
@@ -204,3 +206,69 @@ def test_fetch_papers_for_mailing_calls_year_url():
         )
         fetch_papers_for_mailing("2025", "2025-01")
         m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30)
+
+
+# --- extract_paper_metadata_from_table_row ---
+
+
+def test_extract_paper_metadata_from_table_row_returns_none_when_no_cells():
+    """Empty cell list yields no paper."""
+    assert extract_paper_metadata_from_table_row([], f"{BASE_URL}/2025/") is None
+
+
+def test_extract_paper_metadata_from_table_row_returns_none_when_no_paper_link():
+    """Row without a matching paper href returns None."""
+    html = "<tr><td>No link here</td></tr>"
+    row = BeautifulSoup(html, "html.parser").find("tr")
+    cells = row.find_all(["td", "th"])
+    assert extract_paper_metadata_from_table_row(cells, f"{BASE_URL}/2025/") is None
+
+
+def test_extract_paper_metadata_from_table_row_parses_legacy_five_column_row():
+    """Older tables: Number, Title, Author, Document date, Subgroup (subgroup at index 4)."""
+    html = """
+    <tr>
+    <td><a href="p1234r0.pdf">P1234R0</a></td>
+    <td>My title</td>
+    <td>Author One, Author Two</td>
+    <td>2025-03-15</td>
+    <td>LEWG</td>
+    </tr>
+    """
+    row = BeautifulSoup(html, "html.parser").find("tr")
+    cells = row.find_all(["td", "th"])
+    page_url = f"{BASE_URL}/2025/"
+    result = extract_paper_metadata_from_table_row(cells, page_url)
+    assert result is not None
+    assert result["paper_id"] == "p1234r0"
+    assert result["type"] == "pdf"
+    assert result["filename"] == "p1234r0.pdf"
+    assert result["url"] == f"{BASE_URL}/2025/p1234r0.pdf"
+    assert result["title"] == "My title"
+    assert result["authors"] == ["Author One", "Author Two"]
+    assert result["document_date"] == "2025-03-15"
+    assert result["subgroup"] == "LEWG"
+
+
+def test_extract_paper_metadata_from_table_row_parses_eight_column_row():
+    """2026+ style: subgroup is column 7 (index 6), not index 4 (mailing date)."""
+    html = """
+    <tr>
+    <td><a href="p1000r7.pdf">P1000R7</a></td>
+    <td>C++ IS Schedule (proposed)</td>
+    <td>Herb Sutter</td>
+    <td>2026-01-13</td>
+    <td>2026-01</td>
+    <td>P1000R6</td>
+    <td>All of WG21</td>
+    <td></td>
+    </tr>
+    """
+    row = BeautifulSoup(html, "html.parser").find("tr")
+    cells = row.find_all(["td", "th"])
+    page_url = f"{BASE_URL}/2026/"
+    result = extract_paper_metadata_from_table_row(cells, page_url)
+    assert result is not None
+    assert result["paper_id"] == "p1000r7"
+    assert result["document_date"] == "2026-01-13"
+    assert result["subgroup"] == "All of WG21"
diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py
index ad56a29..099f105 100644
--- a/wg21_paper_tracker/tests/test_pipeline.py
+++ b/wg21_paper_tracker/tests/test_pipeline.py
@@ -1,111 +1,27 @@
 """Tests for wg21_paper_tracker.pipeline."""
 
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch
 
 import pytest
-import requests
-
-from wg21_paper_tracker.pipeline import (
-    DOWNLOAD_TIMEOUT,
-    DOWNLOAD_MAX_RETRIES,
-    _download_file,
-    run_tracker_pipeline,
-)
-
-
-# --- _download_file ---
-
-
-def test_download_file_success_text(tmp_path):
-    """_download_file saves text response and returns True."""
-    url = "https://example.com/doc.html"
-    filepath = tmp_path / "doc.html"
-    resp = MagicMock()
-    resp.raise_for_status = MagicMock()
-    resp.headers = {"content-type": "text/html; charset=utf-8"}
-    resp.content = b"Hello"
-    resp.apparent_encoding = "utf-8"
-    resp.iter_content = None
-    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp):
-        result = _download_file(url, filepath)
-    assert result is True
-    assert filepath.read_text(encoding="utf-8") == "Hello"
-
-
-def test_download_file_success_binary(tmp_path):
-    """_download_file saves binary response and returns True."""
-    url = "https://example.com/doc.pdf"
-    filepath = tmp_path / "doc.pdf"
-    resp = MagicMock()
-    resp.raise_for_status = MagicMock()
-    resp.headers = {"content-type": "application/pdf"}
-    resp.iter_content = lambda chunk_size: (b"\x25\x50\x44\x46",)
-    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp):
-        result = _download_file(url, filepath)
-    assert result is True
-    assert filepath.read_bytes() == b"\x25\x50\x44\x46"
-
-
-def test_download_file_uses_timeout(tmp_path):
-    """_download_file calls requests.get with DOWNLOAD_TIMEOUT."""
-    url = "https://example.com/f"
-    filepath = tmp_path / "out"
-    resp = MagicMock()
-    resp.raise_for_status = MagicMock()
-    resp.headers = {"content-type": "text/plain"}
-    resp.content = b"x"
-    resp.apparent_encoding = "utf-8"
-    with patch("wg21_paper_tracker.pipeline.requests.get", return_value=resp) as m:
-        _download_file(url, filepath)
-    m.assert_called_once()
-    assert m.call_args[1]["timeout"] == DOWNLOAD_TIMEOUT
-
-
-def test_download_file_retries_on_failure(tmp_path):
-    """_download_file retries up to DOWNLOAD_MAX_RETRIES then returns False."""
-    url = "https://example.com/f"
-    filepath = tmp_path / "f"
-    with 
patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = requests.RequestException("connection error") - with patch("wg21_paper_tracker.pipeline.time.sleep") as sleep_mock: - result = _download_file(url, filepath) - assert result is False - assert m.call_count == DOWNLOAD_MAX_RETRIES - assert sleep_mock.call_count == DOWNLOAD_MAX_RETRIES - 1 - - -def test_download_file_succeeds_on_second_attempt(tmp_path): - """_download_file succeeds when a retry succeeds.""" - url = "https://example.com/f" - filepath = tmp_path / "f" - resp = MagicMock() - resp.raise_for_status = MagicMock() - resp.headers = {"content-type": "text/plain"} - resp.content = b"ok" - resp.apparent_encoding = "utf-8" - with patch("wg21_paper_tracker.pipeline.requests.get") as m: - m.side_effect = [requests.RequestException("first fail"), resp] - with patch("wg21_paper_tracker.pipeline.time.sleep"): - result = _download_file(url, filepath) - assert result is True - assert m.call_count == 2 - assert filepath.read_text() == "ok" + +from wg21_paper_tracker.pipeline import TrackerPipelineResult, run_tracker_pipeline # --- run_tracker_pipeline --- @pytest.mark.django_db -def test_run_tracker_pipeline_returns_zero_when_no_mailings(): - """run_tracker_pipeline returns 0 when fetch_all_mailings returns [].""" +def test_run_tracker_pipeline_returns_empty_when_no_mailings(): + """run_tracker_pipeline returns empty result when fetch_all_mailings returns [].""" with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]): - n = run_tracker_pipeline() - assert n == 0 + result = run_tracker_pipeline() + assert result.new_paper_count == 0 + assert result.new_paper_urls == () @pytest.mark.django_db def test_run_tracker_pipeline_skips_when_no_new_mailings(): - """run_tracker_pipeline returns 0 when all mailings are older than or equal to latest in DB.""" + """run_tracker_pipeline returns empty when all mailings are <= latest in DB.""" from wg21_paper_tracker.models import WG21Mailing WG21Mailing.objects.create(mailing_date="2025-02", title="Latest") @@ -117,13 +33,13 @@ def test_run_tracker_pipeline_skips_when_no_new_mailings(): with patch( "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[] ): - n = run_tracker_pipeline() - assert n == 0 + result = run_tracker_pipeline() + assert result.new_paper_count == 0 @pytest.mark.django_db -def test_run_tracker_pipeline_downloads_new_papers(tmp_path): - """run_tracker_pipeline downloads papers for new mailings and returns count.""" +def test_run_tracker_pipeline_collects_urls_for_new_papers(): + """run_tracker_pipeline returns URLs for papers created in this run.""" from wg21_paper_tracker.models import WG21Mailing WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") @@ -147,19 +63,159 @@ def test_run_tracker_pipeline_downloads_new_papers(tmp_path): with patch( "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers ): - with patch( - "wg21_paper_tracker.pipeline.get_raw_dir", return_value=tmp_path - ): - with patch( - "wg21_paper_tracker.pipeline._download_file", return_value=True - ): - with patch( - "wg21_paper_tracker.pipeline.settings.WG21_GCS_BUCKET", - "test-bucket", - ): - with patch( - "wg21_paper_tracker.pipeline._upload_to_gcs", - return_value=True, - ): - n = run_tracker_pipeline() - assert n == 1 + result = run_tracker_pipeline() + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1000r0.pdf",) + + +@pytest.mark.django_db +def 
test_run_tracker_pipeline_from_mailing_date_backfills_older_than_db_latest(): + """from_mailing_date includes mailings >= date even when DB latest is newer.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-02", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Older", "year": "2025"}, + {"mailing_date": "2025-02", "title": "Latest in DB", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1111r0", + "url": "https://example.com/p1111r0.pdf", + "filename": "p1111r0.pdf", + "title": "January paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + result = run_tracker_pipeline(from_mailing_date="2025-01") + assert result.new_paper_count == 1 + assert result.new_paper_urls == ("https://example.com/p1111r0.pdf",) + + +@pytest.mark.django_db +def test_run_tracker_pipeline_second_run_no_new_urls(): + """Existing papers do not add URLs on a subsequent run.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Previous") + mailings = [ + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p1000r0", + "url": "https://example.com/p1000r0.pdf", + "filename": "p1000r0.pdf", + "title": "A paper", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ): + first = run_tracker_pipeline() + second = run_tracker_pipeline() + assert first.new_paper_count == 1 + assert second.new_paper_count == 0 + + +def test_tracker_pipeline_result_count(): + """TrackerPipelineResult.new_paper_count matches tuple length.""" + r = TrackerPipelineResult(new_paper_urls=("a", "b")) + assert r.new_paper_count == 2 + + +def test_run_tracker_pipeline_rejects_bad_from_mailing_date(): + """from_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid from_mailing_date"): + run_tracker_pipeline(from_mailing_date="not-valid") + + +def test_run_tracker_pipeline_rejects_bad_to_mailing_date(): + """to_mailing_date must look like YYYY-MM.""" + with pytest.raises(ValueError, match="Invalid to_mailing_date"): + run_tracker_pipeline(to_mailing_date="not-valid") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_rejects_from_after_to(): + with pytest.raises(ValueError, match="after"): + run_tracker_pipeline(from_mailing_date="2025-03", to_mailing_date="2025-01") + + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_mailing_date_caps_inclusive_range(): + """With from and to, mailings outside [from, to] are skipped.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-03", title="Latest in DB") + mailings = [ + {"mailing_date": "2025-01", "title": "Too early", "year": "2025"}, + {"mailing_date": "2025-02", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-03", "title": "In range", "year": "2025"}, + {"mailing_date": "2025-04", "title": "Too late", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p2222r0", + "url": "https://example.com/p2222r0.pdf", + "filename": "p2222r0.pdf", + "title": "Feb", + "type": "pdf", + 
"authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline( + from_mailing_date="2025-02", to_mailing_date="2025-03" + ) + assert result.new_paper_count == 1 + assert fetch.call_count == 2 + + +@pytest.mark.django_db +def test_run_tracker_pipeline_to_only_caps_incremental_above_latest(): + """to_mailing_date without from: still require mailing_date > latest_in_db.""" + from wg21_paper_tracker.models import WG21Mailing + + WG21Mailing.objects.create(mailing_date="2025-01", title="Latest") + mailings = [ + {"mailing_date": "2025-01", "title": "Latest", "year": "2025"}, + {"mailing_date": "2025-02", "title": "New", "year": "2025"}, + {"mailing_date": "2025-03", "title": "Too new for cap", "year": "2025"}, + ] + papers = [ + { + "paper_id": "p3333r0", + "url": "https://example.com/p3333r0.pdf", + "filename": "p3333r0.pdf", + "title": "A", + "type": "pdf", + "authors": [], + "document_date": None, + "subgroup": "", + }, + ] + with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings): + with patch( + "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers + ) as fetch: + result = run_tracker_pipeline(to_mailing_date="2025-02") + assert result.new_paper_count == 1 + assert fetch.call_count == 1 From 818dcafb0367d61c18a89a6b3994f67a97f5d464 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 21 Mar 2026 05:07:26 -0700 Subject: [PATCH 17/20] Remove migration #24 --- .../0005_alter_slackuser_slack_user_id.py | 18 ------------------ ...005_wg21paperauthorprofile_author_alias.py} | 2 +- dev-24error: | 0 requirements.txt | 2 +- wg21_paper_tracker/migrations/0001_initial.py | 2 +- 5 files changed, 3 insertions(+), 21 deletions(-) delete mode 100644 cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py rename cppa_user_tracker/migrations/{0006_wg21paperauthorprofile_author_alias.py => 0005_wg21paperauthorprofile_author_alias.py} (84%) delete mode 100644 dev-24error: diff --git a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py b/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py deleted file mode 100644 index f1cde2c..0000000 --- a/cppa_user_tracker/migrations/0005_alter_slackuser_slack_user_id.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 4.2.28 on 2026-03-09 15:35 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('cppa_user_tracker', '0004_alter_slackuser_slack_user_id_and_more'), - ] - - operations = [ - migrations.AlterField( - model_name='slackuser', - name='slack_user_id', - field=models.CharField(max_length=64, unique=True), - ), - ] diff --git a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py similarity index 84% rename from cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py rename to cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py index 1660763..5623629 100644 --- a/cppa_user_tracker/migrations/0006_wg21paperauthorprofile_author_alias.py +++ b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), + 
("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"), ] operations = [ diff --git a/dev-24error: b/dev-24error: deleted file mode 100644 index e69de29..0000000 diff --git a/requirements.txt b/requirements.txt index 572af95..5e3ce95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,4 @@ PyGithub>=2.0 # cppa_pinecone_sync app pinecone>=3.0 langchain-core>=0.1 -langchain-text-splitters>=0.0.1 \ No newline at end of file +langchain-text-splitters>=0.0.1 diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py index a2bbf3d..9c6b4d6 100644 --- a/wg21_paper_tracker/migrations/0001_initial.py +++ b/wg21_paper_tracker/migrations/0001_initial.py @@ -9,7 +9,7 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ("cppa_user_tracker", "0005_alter_slackuser_slack_user_id"), + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), ] operations = [ From 4eb5c9b73bea24bd68ecadb1fdc7a5c9b68e0439 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 27 Mar 2026 09:55:21 -0700 Subject: [PATCH 18/20] wg21 paper updates, WG21 profile test fix, revert separate test DB URL #24 --- .github/workflows/actions.yml | 1 - config/test_settings.py | 10 +++----- cppa_user_tracker/services.py | 32 +++++++----------------- cppa_user_tracker/tests/test_services.py | 17 ++++++++++--- wg21_paper_tracker/pipeline.py | 12 ++++++--- wg21_paper_tracker/services.py | 8 +++++- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index d3f1723..a811078 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -89,7 +89,6 @@ jobs: - name: Test with pytest env: DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres - TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/postgres SECRET_KEY: for-testing-only DJANGO_SETTINGS_MODULE: config.test_settings run: | diff --git a/config/test_settings.py b/config/test_settings.py index 38aca75..b724fd9 100644 --- a/config/test_settings.py +++ b/config/test_settings.py @@ -7,14 +7,10 @@ from pathlib import Path from .settings import * # noqa: F401, F403 -from .settings import env -# Use SQLite in-memory for tests by default so no PostgreSQL is required. -# Set TEST_DATABASE_URL to run tests against PostgreSQL (e.g. in CI). -_test_db_url = os.environ.get("TEST_DATABASE_URL", "").strip() -if _test_db_url: - DATABASES = {"default": env.db("TEST_DATABASE_URL")} -else: +# Use SQLite in-memory for speed when DATABASE_URL not set (e.g. local pytest). 
+# CI can set DATABASE_URL=sqlite:///test.sqlite3 or leave unset for :memory: +if not os.environ.get("DATABASE_URL", "").strip(): DATABASES = { "default": { "ENGINE": "django.db.backends.sqlite3", diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 34da007..d35d23e 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -374,28 +374,14 @@ def get_or_create_wg21_paper_author_profile( ) ) - if not candidates: - profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) - if email_val: - add_email(profile, email_val, is_primary=True) - return profile, True - - if len(candidates) == 1: - profile = candidates[0] - if email_val and not profile.emails.filter(email=email_val).exists(): - add_email( - profile, - email_val, - is_primary=not profile.emails.filter(is_active=True).exists(), - ) - return profile, False - - # Two or more: disambiguate by email if provided + # Disambiguate by email if provided. + for p in candidates: + if email_val and p.emails.filter(email=email_val).exists(): + return p, False + elif not email_val and not p.emails.exists(): + return p, False + + profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) if email_val: - for p in candidates: - if p.emails.filter(email=email_val).exists(): - return p, False - profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val) add_email(profile, email_val, is_primary=True) - return profile, True - return candidates[0], False + return profile, True diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py index 75775ed..0c09e0c 100644 --- a/cppa_user_tracker/tests/test_services.py +++ b/cppa_user_tracker/tests/test_services.py @@ -609,16 +609,25 @@ def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it(): @pytest.mark.django_db -def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_adds_email(): - """Existing single match gets the supplied email attached.""" +def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_creates_new_profile(): + """One name match but email not on that profile: creates a new profile with the email. + + Disambiguation only returns an existing row when the email matches or when no email + is passed and the candidate has no emails; otherwise a new profile is created. + """ existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author") profile, created = services.get_or_create_wg21_paper_author_profile( display_name="Solo Author", email="solo@example.com", ) - assert created is False - assert profile.id == existing.id + assert created is True + assert profile.id != existing.id + assert profile.display_name == "Solo Author" assert profile.emails.filter(email="solo@example.com").exists() + assert ( + WG21PaperAuthorProfile.objects.filter(display_name="Solo Author").count() == 2 + ) + assert not existing.emails.filter(email="solo@example.com").exists() @pytest.mark.django_db diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py index ff15f50..a6bf41b 100644 --- a/wg21_paper_tracker/pipeline.py +++ b/wg21_paper_tracker/pipeline.py @@ -136,7 +136,9 @@ def _valid_paper_entries_for_id( return valid -def _choose_best_format_entry(valid_list: list[dict[str, Any]]) -> dict[str, Any]: +def _choose_best_format_entry( + valid_list: list[dict[str, Any]], +) -> dict[str, Any]: """Pick one row by format priority (adoc first). 
Precondition: valid_list non-empty.""" return min( valid_list, @@ -344,10 +346,12 @@ def run_tracker_pipeline( to_mailing_date=to_mailing_date, ) } + new_mailing_dates = set(m["mailing_date"] for m in new_mailings) for current_m in all_mailings: - if current_m["mailing_date"] in retry_dates and current_m[ - "mailing_date" - ] not in [x["mailing_date"] for x in new_mailings]: + if ( + current_m["mailing_date"] in retry_dates + and current_m["mailing_date"] not in new_mailing_dates + ): new_mailings.append(current_m) # Sort chronologically (oldest to newest) diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py index 24c7ba6..983493a 100644 --- a/wg21_paper_tracker/services.py +++ b/wg21_paper_tracker/services.py @@ -4,6 +4,7 @@ from __future__ import annotations +from datetime import date from typing import TYPE_CHECKING, Optional from django.db import IntegrityError, transaction @@ -40,7 +41,7 @@ def get_or_create_paper( paper_id: str, url: str, title: str, - document_date: Optional[str], + document_date: date | None, mailing: WG21Mailing, subgroup: str = "", author_names: Optional[list[str]] = None, @@ -139,6 +140,9 @@ def _update_paper(paper: WG21Paper) -> bool: created = False if author_names: + if not created: + for author in paper.authors.all(): + author.delete() emails = author_emails or [] for i, name in enumerate(author_names): email = emails[i] if i < len(emails) else None @@ -156,6 +160,8 @@ def get_or_create_paper_author( """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based). Updates author_order on existing link if it differs. """ + if not isinstance(author_order, int) or author_order <= 0: + raise ValueError("author_order must be a positive integer") link, link_created = WG21PaperAuthor.objects.get_or_create( paper=paper, profile=profile, From 57d9334990537f24e15ead0d49eab5e2322eb75a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 20 Apr 2026 11:18:26 -0700 Subject: [PATCH 19/20] Fix: lint/format error --- cppa_user_tracker/services.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py index 2317586..f285323 100644 --- a/cppa_user_tracker/services.py +++ b/cppa_user_tracker/services.py @@ -386,6 +386,8 @@ def get_or_create_wg21_paper_author_profile( if email_val: add_email(profile, email_val, is_primary=True) return profile, True + + def get_or_create_youtube_speaker( external_id: str, display_name: str = "", From e122eb7b4a794ba5031b68d8b956dc3cd52dc6d6 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 20 Apr 2026 11:36:02 -0700 Subject: [PATCH 20/20] Fix: compose error --- ..._wg21_author_alias_youtubespeaker_external_id.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py diff --git a/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py new file mode 100644 index 0000000..fcdc4f2 --- /dev/null +++ b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py @@ -0,0 +1,13 @@ +# Merge parallel branches from 0004: WG21 author_alias vs YouTube speaker chain. 
+ +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"), + ("cppa_user_tracker", "0007_youtubespeaker_external_id"), + ] + + operations = []
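
---

Notes with small self-contained Python sketches of the mechanisms above.
Helper names in the sketches are illustrative stand-ins, not the real
module APIs; only the behavior they model comes from the patches.

The year resolution in _parse_csv_import_row prefers the mailing key over
the document date. A minimal sketch of that rule, assuming the command's
MAILING_DATE_PATTERN matches a zero-padded YYYY-MM key:

    import re
    from datetime import date
    from typing import Optional

    MAILING_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}$")  # assumed shape

    def resolve_year(mailing_date: str, document_date: Optional[date]) -> Optional[int]:
        # A mailing key like "2025-03" wins; else fall back to the document date.
        if mailing_date and MAILING_DATE_PATTERN.match(mailing_date):
            return int(mailing_date[:4])
        if document_date is not None:
            return document_date.year
        return None

    assert resolve_year("2025-03", None) == 2025
    assert resolve_year("", date(2024, 11, 2)) == 2024
    assert resolve_year("", None) is None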
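The repository_dispatch trigger is a single authenticated POST; GitHub
answers 204 No Content on success. A standalone sketch of the same request
shape, with placeholder repo and token values:

    import requests

    repo = "myorg/convert-repo"   # placeholder; the command reads Django settings
    token = "ghp-placeholder"     # placeholder secret, never hard-coded in practice

    response = requests.post(
        f"https://api.github.com/repos/{repo}/dispatches",
        json={
            "event_type": "wg21_papers_convert",
            "client_payload": {"papers": ["https://open-std.org/a.pdf"]},
        },
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {token}",
            "X-GitHub-Api-Version": "2022-11-28",
        },
        timeout=30,
    )
    response.raise_for_status()  # 204 expected; 4xx/5xx raises

The receiving repository's workflow filters on the event_type and reads the
URL list from the client payload of the dispatch event.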
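The from/to selection in run_tracker_pipeline reduces to one predicate over
zero-padded YYYY-MM keys, so plain string comparison is sufficient. A sketch
mirroring _mailing_date_in_run_scope, with a few worked cases (latest mailing
in the DB assumed to be 2025-02):

    from typing import Optional

    def in_run_scope(
        mailing_date: str,
        latest_date: str,
        from_date: Optional[str],
        to_date: Optional[str],
    ) -> bool:
        if from_date is None and to_date is None:
            return mailing_date > latest_date  # plain incremental run
        if from_date is not None and mailing_date < from_date:
            return False
        if to_date is not None and mailing_date > to_date:
            return False
        if from_date is None and to_date is not None:
            return mailing_date > latest_date  # incremental, capped at to
        return True

    assert in_run_scope("2025-03", "2025-02", None, None)                # newer only
    assert not in_run_scope("2025-02", "2025-02", None, None)            # not strictly newer
    assert in_run_scope("2025-01", "2025-02", "2025-01", None)           # explicit backfill
    assert not in_run_scope("2025-04", "2025-02", "2025-02", "2025-03")  # above cap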
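_parse_mailing_year collapses every bad year to the sentinel 0 instead of
failing the run, so malformed index rows can be fixed later in the DB. The
same normalization without the logging, as a sketch:

    from datetime import datetime

    def normalize_year(year_raw) -> int:
        # 0 is the "fix later" sentinel; anything implausible maps to it.
        if not year_raw or not str(year_raw).strip():
            return 0
        try:
            year = int(str(year_raw).strip()[:4])
        except (ValueError, TypeError):
            return 0
        if year <= 0 or year > datetime.now().year + 1:
            return 0
        return year

    assert normalize_year("2025") == 2025
    assert normalize_year(None) == 0
    assert normalize_year("soon") == 0
    assert normalize_year("9999") == 0  # more than a year in the future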
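Format preference when one paper_id appears in several formats is a plain
min() over a priority map; unknown extensions sort last. A sketch:

    def format_priority(ext: str) -> int:
        # Lower wins: adoc > html > ps > pdf; anything else after those.
        return {"adoc": 1, "html": 2, "ps": 3, "pdf": 4}.get(ext.lower(), 100)

    entries = [
        {"type": "pdf", "url": "https://example.com/p1000r0.pdf"},
        {"type": "html", "url": "https://example.com/p1000r0.html"},
    ]
    best = min(entries, key=lambda e: format_priority(e["type"]))
    assert best["type"] == "html"  # an adoc entry would beat both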
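The reworked get_or_create_wg21_paper_author_profile only reuses a
name-matched profile when the match is unambiguous; otherwise it creates a
new one, which is what the updated test pins down. A sketch of that decision
with profiles reduced to plain dicts (the ORM and email model are simplified
for illustration):

    def pick_profile(candidates: list[dict], name: str, email: str) -> tuple[dict, bool]:
        for p in candidates:
            if email and email in p["emails"]:
                return p, False  # email already on this profile: same person
            if not email and not p["emails"]:
                return p, False  # nothing to distinguish by: reuse
        # Name collision without a confirming email: separate profile.
        return {"display_name": name, "emails": [email] if email else []}, True

    solo = {"display_name": "Solo Author", "emails": []}
    _, created = pick_profile([solo], "Solo Author", "")
    assert created is False
    _, created = pick_profile([solo], "Solo Author", "solo@example.com")
    assert created is True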