diff --git a/config/settings.py b/config/settings.py
index 060aa0c2..cecb131d 100644
--- a/config/settings.py
+++ b/config/settings.py
@@ -67,6 +67,7 @@
"clang_github_tracker",
"cppa_slack_tracker",
"discord_activity_tracker",
+ "wg21_paper_tracker",
"cppa_youtube_script_tracker",
"slack_event_handler",
]
@@ -160,6 +161,7 @@
"cppa_slack_tracker",
"discord_activity_tracker",
"boost_mailing_list_tracker",
+ "wg21_paper_tracker",
"cppa_youtube_script_tracker",
"shared",
)
@@ -451,6 +453,16 @@ def _slack_team_scope_from_env():
)
).resolve()
# WG21 Paper Tracker Configuration
# Feature flag: when False, the tracker never sends a repository_dispatch.
WG21_GITHUB_DISPATCH_ENABLED = env.bool("WG21_GITHUB_DISPATCH_ENABLED", default=False)
# Target repository as "owner/repo" whose workflow will be triggered.
WG21_GITHUB_DISPATCH_REPO = (env("WG21_GITHUB_DISPATCH_REPO", default="") or "").strip()
# Token used as Bearer auth for POST /repos/{owner}/{repo}/dispatches.
WG21_GITHUB_DISPATCH_TOKEN = (
    env("WG21_GITHUB_DISPATCH_TOKEN", default="") or ""
).strip()
# Must match on.repository_dispatch.types in the target workflow; the trailing
# "or" restores the default when the env var is set to whitespace only.
WG21_GITHUB_DISPATCH_EVENT_TYPE = (
    env("WG21_GITHUB_DISPATCH_EVENT_TYPE", default="wg21_papers_convert") or ""
).strip() or "wg21_papers_convert"
+
# Logging - project-wide configuration for app commands (console + rotating file)
LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs")))
LOG_FILE = env("LOG_FILE", default="app.log")
diff --git a/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py
new file mode 100644
index 00000000..5623629c
--- /dev/null
+++ b/cppa_user_tracker/migrations/0005_wg21paperauthorprofile_author_alias.py
@@ -0,0 +1,19 @@
+# Generated by Django 4.2.28
+
+from django.db import migrations, models
+
+
class Migration(migrations.Migration):
    """Add the indexed, blank-able author_alias field to WG21PaperAuthorProfile."""

    dependencies = [
        ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="wg21paperauthorprofile",
            name="author_alias",
            # preserve_default=False: the "" default only backfills existing rows.
            field=models.CharField(blank=True, db_index=True, default="", max_length=255),
            preserve_default=False,
        ),
    ]
diff --git a/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py
new file mode 100644
index 00000000..fcdc4f2b
--- /dev/null
+++ b/cppa_user_tracker/migrations/0008_merge_wg21_author_alias_youtubespeaker_external_id.py
@@ -0,0 +1,13 @@
+# Merge parallel branches from 0004: WG21 author_alias vs YouTube speaker chain.
+
+from django.db import migrations
+
+
class Migration(migrations.Migration):
    """No-op merge of the 0005 (author_alias) and 0007 (youtubespeaker external_id) branches."""

    dependencies = [
        ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"),
        ("cppa_user_tracker", "0007_youtubespeaker_external_id"),
    ]

    # Merge migrations carry no schema changes; they only join parallel branches.
    operations = []
diff --git a/cppa_user_tracker/models.py b/cppa_user_tracker/models.py
index 4e7afd31..7357c017 100644
--- a/cppa_user_tracker/models.py
+++ b/cppa_user_tracker/models.py
@@ -166,6 +166,7 @@ def save(self, *args, **kwargs):
super().save(*args, **kwargs)
display_name = models.CharField(max_length=255, db_index=True, blank=True)
+ author_alias = models.CharField(max_length=255, blank=True, db_index=True)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
diff --git a/cppa_user_tracker/services.py b/cppa_user_tracker/services.py
index d73bcf15..f2853230 100644
--- a/cppa_user_tracker/services.py
+++ b/cppa_user_tracker/services.py
@@ -26,6 +26,7 @@
MailingListProfile,
SlackUser,
DiscordProfile,
+ WG21PaperAuthorProfile,
YoutubeSpeaker,
)
@@ -248,7 +249,9 @@ def _get_next_negative_github_account_id() -> int:
@transaction.atomic
-def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool]:
+def get_or_create_slack_user(
+ user_data: dict[str, Any],
+) -> tuple[SlackUser, bool]:
"""Get or create a SlackUser from Slack API user data. Returns (SlackUser, created).
If the user exists, updates username, display_name, and avatar_url from user_data.
@@ -353,6 +356,38 @@ def get_or_create_discord_profile(
return profile, created
def get_or_create_wg21_paper_author_profile(
    display_name: str,
    email: Optional[str] = None,
) -> tuple[WG21PaperAuthorProfile, bool]:
    """Get or create a WG21PaperAuthorProfile by display_name, optionally disambiguated by email.

    Resolution over existing profiles with the same (stripped) display_name:
    - If ``email`` is given, return the candidate that already carries that email.
    - If no ``email`` is given, return the first candidate (lowest id) that has
      no emails at all.
    - Otherwise — including the no-candidates case and an email that matches no
      candidate — create a new profile, attach ``email`` as primary when
      provided, and return it with ``created=True``.

    Existing profiles are never modified by this function.

    Returns:
        (profile, created) tuple.
    """
    display_name_val = (display_name or "").strip()
    email_val = (email or "").strip() or None

    # Ordered by id so "first" is deterministic across calls.
    candidates = list(
        WG21PaperAuthorProfile.objects.filter(display_name=display_name_val).order_by(
            "id"
        )
    )

    # Disambiguate by email if provided.
    for p in candidates:
        if email_val and p.emails.filter(email=email_val).exists():
            return p, False
        elif not email_val and not p.emails.exists():
            return p, False

    profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val)
    if email_val:
        add_email(profile, email_val, is_primary=True)
    return profile, True
+
+
def get_or_create_youtube_speaker(
external_id: str,
display_name: str = "",
diff --git a/cppa_user_tracker/tests/test_services.py b/cppa_user_tracker/tests/test_services.py
index cf614811..0c09e0ce 100644
--- a/cppa_user_tracker/tests/test_services.py
+++ b/cppa_user_tracker/tests/test_services.py
@@ -8,6 +8,7 @@
GitHubAccountType,
Identity,
TempProfileIdentityRelation,
+ WG21PaperAuthorProfile,
)
from cppa_user_tracker import services
@@ -569,3 +570,103 @@ def test_get_or_create_mailing_list_profile_strips_display_name_and_email():
assert created is True
assert profile.display_name == "Trimmed"
assert profile.emails.filter(email="trimmed@example.com").exists()
+
+
+# --- get_or_create_wg21_paper_author_profile ---
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_no_candidates_creates():
+ """get_or_create_wg21_paper_author_profile creates new profile when none exist."""
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="New Author"
+ )
+ assert created is True
+ assert profile.display_name == "New Author"
+ assert WG21PaperAuthorProfile.objects.filter(display_name="New Author").count() == 1
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_no_candidates_with_email_adds_email():
+ """get_or_create_wg21_paper_author_profile adds email to new profile when provided."""
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Author With Email",
+ email="author@example.com",
+ )
+ assert created is True
+ assert profile.emails.filter(email="author@example.com").exists()
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it():
+ """get_or_create_wg21_paper_author_profile returns existing profile when exactly one matches."""
+ existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author")
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Solo Author"
+ )
+ assert created is False
+ assert profile.id == existing.id
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_creates_new_profile():
+ """One name match but email not on that profile: creates a new profile with the email.
+
+ Disambiguation only returns an existing row when the email matches or when no email
+ is passed and the candidate has no emails; otherwise a new profile is created.
+ """
+ existing = WG21PaperAuthorProfile.objects.create(display_name="Solo Author")
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Solo Author",
+ email="solo@example.com",
+ )
+ assert created is True
+ assert profile.id != existing.id
+ assert profile.display_name == "Solo Author"
+ assert profile.emails.filter(email="solo@example.com").exists()
+ assert (
+ WG21PaperAuthorProfile.objects.filter(display_name="Solo Author").count() == 2
+ )
+ assert not existing.emails.filter(email="solo@example.com").exists()
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first():
+ """get_or_create_wg21_paper_author_profile returns first profile when multiple match and no email."""
+ first = WG21PaperAuthorProfile.objects.create(display_name="Dup Name")
+ _second = WG21PaperAuthorProfile.objects.create(display_name="Dup Name")
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Dup Name"
+ )
+ assert created is False
+ assert profile.id == first.id
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_second():
+ """get_or_create_wg21_paper_author_profile returns profile with matching email when multiple match."""
+ _first = WG21PaperAuthorProfile.objects.create(display_name="Same Name")
+ second = WG21PaperAuthorProfile.objects.create(display_name="Same Name")
+ services.add_email(second, "match@example.com", is_primary=True)
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Same Name",
+ email="match@example.com",
+ )
+ assert created is False
+ assert profile.id == second.id
+
+
+@pytest.mark.django_db
+def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_creates_new_profile():
+ """When multiple match and email matches none, a new profile is created with that email."""
+ first = WG21PaperAuthorProfile.objects.create(display_name="Other Name")
+ second = WG21PaperAuthorProfile.objects.create(display_name="Other Name")
+ services.add_email(second, "other@example.com", is_primary=True)
+ profile, created = services.get_or_create_wg21_paper_author_profile(
+ display_name="Other Name",
+ email="nomatch@example.com",
+ )
+ assert created is True
+ assert profile.id not in (first.id, second.id)
+ assert profile.display_name == "Other Name"
+ assert profile.emails.filter(email="nomatch@example.com").exists()
diff --git a/docs/Schema.md b/docs/Schema.md
index ec3ac250..5b012f62 100644
--- a/docs/Schema.md
+++ b/docs/Schema.md
@@ -69,6 +69,7 @@ erDiagram
WG21PaperAuthorProfile {
string display_name "IX"
+ string author_alias "IX"
datetime created_at
datetime updated_at
}
@@ -633,21 +634,35 @@ erDiagram
erDiagram
Direction LR
WG21PaperAuthorProfile ||--o{ WG21PaperAuthor : "author"
+ WG21Mailing ||--o{ WG21Paper : "has"
WG21PaperAuthor }o--|| WG21Paper : "has"
WG21PaperAuthor {
int id PK
int paper_id FK
int profile_id FK
+ int author_order
datetime created_at
}
+ WG21Mailing {
+ int id PK
+ string mailing_date UK "IX"
+ string title
+ datetime created_at
+ datetime updated_at
+ }
+
WG21Paper {
int id PK
- string paper_id UK "IX"
+ string paper_id "IX"
+ int year "IX"
string url
string title "IX"
- date publication_date "IX"
+ date document_date "IX"
+ int mailing_id FK "IX"
+ string subgroup "IX"
+ boolean is_downloaded "IX"
datetime created_at
datetime updated_at
}
@@ -655,7 +670,11 @@ erDiagram
**Note:** **WG21PaperAuthorProfile** extends `BaseProfile` (section 1). `profile_id` in WG21PaperAuthor references this profile; each paper can have multiple authors.
-**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor.
+**Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing.
+
+**Note:** **WG21Paper** is uniquely identified by the composite `(paper_id, year)`; `paper_id` is not globally unique. The same paper identifier may appear in different years (e.g. revisions).
+
+**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. `author_order` is optional and 1-based; it indicates the order of authors on the paper.
---
diff --git a/docs/operations/WG21_GitHub_Dispatch.md b/docs/operations/WG21_GitHub_Dispatch.md
new file mode 100644
index 00000000..49046b01
--- /dev/null
+++ b/docs/operations/WG21_GitHub_Dispatch.md
@@ -0,0 +1,69 @@
+# WG21 Paper Tracker → GitHub Actions (`repository_dispatch`)
+
+The Django app **`run_wg21_paper_tracker`** scrapes WG21 mailings and stores paper metadata in the database. It does **not** download PDFs or other documents. When **new** paper rows are created in a run, it can send **one** [repository dispatch](https://docs.github.com/en/rest/repos/repos#create-a-repository-dispatch-event) to another GitHub repository so a workflow there fetches each URL and runs conversion (e.g. PDF → Markdown).
+
+## Environment variables
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `WG21_GITHUB_DISPATCH_ENABLED` | No (default `false`) | Set to `true` to send `repository_dispatch` when there are new papers. |
+| `WG21_GITHUB_DISPATCH_REPO` | Yes, if enabled | Target repo as `owner/repo` (the repo whose workflow will run). |
+| `WG21_GITHUB_DISPATCH_TOKEN` | Yes, if enabled | PAT or token with permission to create repository dispatch events on that repo (classic PAT: `repo` scope for private repos). |
+| `WG21_GITHUB_DISPATCH_EVENT_TYPE` | No | Must match `on.repository_dispatch.types` in the target workflow. Default: `wg21_papers_convert`. |
+
+## `client_payload` contract
+
+The JSON body includes only a list of URL strings:
+
+```json
+{
+ "papers": [
+ "https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/…",
+ "https://www.open-std.org/…"
+ ]
+}
+```
+
+- **`papers`**: array of strings (WG21 document URLs), all new papers from **that** pipeline run in a **single** event.
+- There is **no** `new_paper_count` field; derive the count from the `papers` array in the workflow (e.g. in a script step with `jq 'length'` over the payload) if needed.
+
+## Target repository workflow (example)
+
+```yaml
+on:
+ repository_dispatch:
+ types: [wg21_papers_convert]
+
+jobs:
+ convert:
+ runs-on: ubuntu-latest
+ steps:
+ - name: URLs
+ run: |
+ echo '${{ toJson(github.event.client_payload.papers) }}'
+ # Fetch each URL, convert, store artifacts / upload elsewhere
+```
+
+In expressions, `github.event.client_payload.papers` is a JSON array of strings.
+
+## Token security
+
+Store `WG21_GITHUB_DISPATCH_TOKEN` in a secret manager or CI secret—never commit it. Prefer a fine-grained PAT scoped to the conversion repo if possible.
+
+## Payload size
+
+Very large mailings could produce many URLs in one payload. If you approach GitHub or runner limits, document a split strategy (multiple dispatches) as an edge case; the default is one dispatch per tracker run with the full list.
+
+## CLI options
+
+- **`--from-date YYYY-MM`**: Process mailings with `mailing_date >= YYYY-MM` (WG21 / CSV style). Backfills from that key onward when used alone.
+- **`--to-date YYYY-MM`**: Upper bound: `mailing_date <= YYYY-MM`. With `--from-date`, the run uses the inclusive range `[from, to]`. Without `--from-date`, behavior stays incremental (only mailings **newer than** the latest `WG21Mailing` in the DB), but capped at `to`—useful to avoid pulling very new mailings in a controlled run.
+- **`--dry-run`**: Log only; do not run the pipeline or send dispatch.
+
+## Flow summary
+
+1. Scheduler runs `run_wg21_paper_tracker` (optionally with `--from-date` / `--to-date`).
+2. Pipeline fetches mailings, upserts `WG21Mailing` / `WG21Paper` (metadata only).
+3. For each row **newly created** in that run, its document URL is collected.
+4. If the list is non-empty and dispatch is enabled, the app POSTs once to `POST /repos/{owner}/{repo}/dispatches` with `event_type` and `client_payload: { "papers": [ ... ] }`.
+5. The conversion repo’s workflow runs and downloads each URL.
diff --git a/docs/service_api/cppa_user_tracker.md b/docs/service_api/cppa_user_tracker.md
index f638501b..8f506423 100644
--- a/docs/service_api/cppa_user_tracker.md
+++ b/docs/service_api/cppa_user_tracker.md
@@ -41,6 +41,14 @@
---
+## WG21PaperAuthorProfile
+
+| Function | Parameter types | Return type | Description |
+| -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by display_name, with optional email disambiguation. If no same-name profile exists, creates one (attaching the email when provided). If a same-name profile already carries the supplied email, returns it. When no email is supplied, returns the first same-name profile that has no emails. In every other case — including an email that matches no existing same-name profile — a **new** profile is created with that email attached; existing profiles are never modified by this function. Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email resolve to the same profile. |
+
+---
+
## DiscordProfile
| Function | Parameter types | Return type | Description |
diff --git a/requirements.txt b/requirements.txt
index 34f77e77..cdd67211 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,6 +24,8 @@ slack-bolt>=1.18
pytz>=2024.1
selenium>=4.35
+# wg21_paper_tracker app
+beautifulsoup4>=4.12.0
# cppa_youtube_script_tracker app (YouTube Data API v3 + VTT transcript download)
google-api-python-client>=2.100
yt-dlp==2026.2.4
diff --git a/wg21_paper_tracker/__init__.py b/wg21_paper_tracker/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/wg21_paper_tracker/admin.py b/wg21_paper_tracker/admin.py
new file mode 100644
index 00000000..bd57f4c2
--- /dev/null
+++ b/wg21_paper_tracker/admin.py
@@ -0,0 +1,41 @@
+from django.contrib import admin
+from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor
+
+
@admin.register(WG21Mailing)
class WG21MailingAdmin(admin.ModelAdmin):
    """Admin list for mailings, newest mailing_date first."""

    list_display = ("mailing_date", "title", "created_at", "updated_at")
    search_fields = ("mailing_date", "title")
    ordering = ("-mailing_date",)
+
+
class WG21PaperAuthorInline(admin.TabularInline):
    """Inline author rows on the paper admin page, ordered by author_order."""

    model = WG21PaperAuthor
    extra = 1
    # raw_id widget avoids loading every author profile into a dropdown.
    raw_id_fields = ("profile",)
    ordering = ("author_order", "id")
+
+
@admin.register(WG21Paper)
class WG21PaperAdmin(admin.ModelAdmin):
    """Paper admin with inline authors; filterable by download state, subgroup, mailing, year."""

    list_display = (
        "paper_id",
        "year",
        "title",
        "document_date",
        "mailing",
        "subgroup",
        "is_downloaded",
    )
    search_fields = ("paper_id", "title", "url", "subgroup")
    list_filter = ("is_downloaded", "subgroup", "mailing", "year")
    ordering = ("-document_date", "-paper_id")
    inlines = [WG21PaperAuthorInline]
+
+
@admin.register(WG21PaperAuthor)
class WG21PaperAuthorAdmin(admin.ModelAdmin):
    """Admin for the paper↔author-profile through model."""

    list_display = ("paper", "profile", "author_order", "created_at")
    search_fields = ("paper__paper_id", "profile__display_name")
    raw_id_fields = ("paper", "profile")
    ordering = ("paper", "author_order", "id")
diff --git a/wg21_paper_tracker/apps.py b/wg21_paper_tracker/apps.py
new file mode 100644
index 00000000..d6f09d9b
--- /dev/null
+++ b/wg21_paper_tracker/apps.py
@@ -0,0 +1,7 @@
+from django.apps import AppConfig
+
+
class Wg21PaperTrackerConfig(AppConfig):
    """Django app config for the WG21 paper tracker."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "wg21_paper_tracker"
    verbose_name = "WG21 Paper Tracker"
diff --git a/wg21_paper_tracker/fetcher.py b/wg21_paper_tracker/fetcher.py
new file mode 100644
index 00000000..05f6e98c
--- /dev/null
+++ b/wg21_paper_tracker/fetcher.py
@@ -0,0 +1,213 @@
+"""
+Fetcher for WG21 Papers.
+Scrapes the WG21 papers index and specific mailing tables.
+"""
+
+import re
+import urllib.parse
+from typing import Optional
+
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://www.open-std.org/jtc1/sc22/wg21/docs/papers"
+
+_MAILING_ANCHOR_RE = re.compile(r"^mailing\d{4}-\d{2}$")
+# Paper link in first column: e.g. p1234r0.pdf, n4920.html, sd-9.md
+_PAPER_LINK_PATTERN = re.compile(r"((?:p\d+r\d+|n\d+|sd-\d+))\.([a-z]+)", re.IGNORECASE)
+
+
def extract_paper_metadata_from_table_row(
    cells: list[Tag],
    page_url: str,
) -> Optional[dict]:
    """
    Extract paper metadata from a WG21 mailing table row (td/th cells).

    Current year pages (e.g. 2026) carry eight columns::

        WG21 Number | Title | Author | Document Date | Mailing Date |
        Previous Version | Subgroup | Disposition

    so the subgroup lives at index 6 (index 4 holds the mailing-date string as
    rendered on the site). Older pages had five data columns with subgroup at
    index 4; rows with fewer than eight cells keep that legacy mapping.

    Returns a metadata dict for the first same-origin paper link found in the
    first cell, or None when the row carries no usable link.
    """
    if not cells:
        return None

    origin = urllib.parse.urlparse(BASE_URL)

    def _cell_text(index: int) -> str:
        # Empty string when the row is too short for this column.
        return cells[index].text.strip() if len(cells) > index else ""

    title = _cell_text(1)

    raw_authors = _cell_text(2)
    authors: list[str] = (
        [part.strip() for part in re.split(r",| and ", raw_authors) if part.strip()]
        if raw_authors
        else []
    )

    # Kept as the raw page string (not parsed into a date here).
    document_date = _cell_text(3) or None

    # 8+ columns: mailing date [4], previous version [5], subgroup [6], disposition [7]
    if len(cells) >= 8:
        subgroup = _cell_text(6)
    else:
        subgroup = _cell_text(4)

    for anchor_tag in cells[0].find_all("a", href=True):
        href = anchor_tag.get("href", "")
        m = _PAPER_LINK_PATTERN.search(href)
        if m is None:
            continue

        absolute_url = urllib.parse.urljoin(page_url, href)
        parts = urllib.parse.urlparse(absolute_url)
        # Only accept links that stay on the open-std.org origin.
        if parts.scheme not in ("https", "http") or parts.netloc != origin.netloc:
            logger.warning("Skipping off-origin paper URL %s", absolute_url)
            continue

        return {
            "url": absolute_url,
            "filename": m.group(0).lower(),
            "type": m.group(2).lower(),
            "paper_id": m.group(1).lower(),
            "title": title,
            "authors": authors,
            "document_date": document_date,
            "subgroup": subgroup,
        }

    return None
+
+
def _find_table_in_section(anchor) -> Optional[Tag]:
    """
    Find the first <table> that belongs to the current mailing section.

    Stops at the next mailing anchor (id/name matching mailingYYYY-MM) so we
    do not attribute another mailing's table to this section. Returns None
    when the anchor is missing/invalid or the section has no table.
    """
    if not anchor:
        return None
    # Only proceed for a genuine mailing anchor (id or name like "mailing2025-02").
    anchor_id = anchor.get("id") or anchor.get("name") or ""
    if not _MAILING_ANCHOR_RE.match(anchor_id):
        return None
    # Walk the document in order starting at the anchor.
    for elem in anchor.next_elements:
        if not hasattr(elem, "name"):  # NavigableString, etc.
            continue
        if elem is anchor:
            continue
        if elem.name == "table":
            return elem
        if not hasattr(elem, "get"):  # e.g. NavigableString
            continue
        next_id = elem.get("id") or elem.get("name") or ""
        if next_id and _MAILING_ANCHOR_RE.match(next_id) and next_id != anchor_id:
            return None  # next section start; no table in this section
    return None
+
+
def fetch_all_mailings() -> list[dict]:
    """
    Fetch the main index and extract all mailings.

    Returns a list of dicts with keys:
    - mailing_date (e.g. '2025-02')
    - title (e.g. '2025-02 pre-Hagenberg mailing')
    - year (e.g. '2025')
    The list keeps the order found on the page (usually newest first).
    Returns [] when the index cannot be fetched.
    """
    logger.info("Fetching WG21 main index: %s/", BASE_URL)
    try:
        response = requests.get(f"{BASE_URL}/", timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        logger.exception("Failed to fetch WG21 index.")
        return []

    # Mailings appear as links of the form "YYYY/#mailingYYYY-MM"; the link
    # text is the human-readable title.
    index_page = BeautifulSoup(response.text, "html.parser")
    href_re = re.compile(r"^(\d{4})/#mailing(\d{4}-\d{2})$")

    results: list[dict] = []
    for link in index_page.find_all("a", href=True):
        hit = href_re.search(link["href"])
        if not hit:
            continue
        link_year, link_mailing_date = hit.groups()
        results.append(
            {
                "mailing_date": link_mailing_date,
                "title": link.text.strip(),
                "year": link_year,
            }
        )

    return results
+
+
def fetch_papers_for_mailing(year: str, mailing_date: str) -> list[dict]:
    """
    Fetch the papers for a specific mailing from the year page.

    Returns a list of paper metadata dicts, deduplicated by filename
    (first occurrence wins, page order preserved). Returns [] on fetch
    failure or when the mailing section/table cannot be located.
    """
    url = f"{BASE_URL}/{year}/"
    logger.info("Fetching mailing %s from %s", mailing_date, url)
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        logger.exception("Failed to fetch year page %s.", year)
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    anchor_id = f"mailing{mailing_date}"
    # The anchor may use either id= or name=.
    anchor = soup.find(id=anchor_id) or soup.find(attrs={"name": anchor_id})
    if not anchor:
        logger.warning("Anchor %s not found on %s", anchor_id, url)
        return []

    table = _find_table_in_section(anchor)
    if not table:
        logger.warning("No table found after anchor %s", anchor_id)
        return []

    # First occurrence of each filename wins; dict preserves insertion order.
    papers_by_filename: dict[str, dict] = {}
    for row in table.find_all("tr"):
        cells = row.find_all(["td", "th"])
        # Skip empty rows and header/separator rows that span columns.
        if not cells or any(cell.get("colspan") for cell in cells):
            continue
        meta = extract_paper_metadata_from_table_row(cells, url)
        if meta and meta["filename"] not in papers_by_filename:
            papers_by_filename[meta["filename"]] = meta

    return list(papers_by_filename.values())
diff --git a/wg21_paper_tracker/management/__init__.py b/wg21_paper_tracker/management/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/wg21_paper_tracker/management/commands/__init__.py b/wg21_paper_tracker/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py
new file mode 100644
index 00000000..824617a8
--- /dev/null
+++ b/wg21_paper_tracker/management/commands/import_wg21_metadata_from_csv.py
@@ -0,0 +1,301 @@
+"""
+Management command: import_wg21_metadata_from_csv
+
+Reads workspace/wg21_paper_tracker/metadata.csv (or a given path) and fills
+WG21Mailing, WG21Paper, and WG21PaperAuthor using get_or_create_mailing and
+get_or_create_paper. Handles missing mailing_date via a placeholder mailing
+(unknown / Unknown).
+"""
+
+from __future__ import annotations
+
+import csv
+import logging
+import re
+from dataclasses import dataclass
+from datetime import date
+from pathlib import Path
+from typing import Optional
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db import IntegrityError
+from django.utils.dateparse import parse_date
+
+from wg21_paper_tracker.models import WG21Paper
+from wg21_paper_tracker.services import (
+ get_or_create_mailing,
+ get_or_create_paper,
+ get_or_create_paper_author,
+)
+from wg21_paper_tracker.workspace import get_workspace_root
+
+logger = logging.getLogger(__name__)
+
+MAILING_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}$")
+TITLE_MAX_LENGTH = 1024
+PLACEHOLDER_MAILING_DATE = "unknown"
+PLACEHOLDER_MAILING_TITLE = "Unknown"
+
+
+def _norm(s: str) -> str:
+ """Return the string stripped of leading/trailing whitespace, or empty string if None."""
+ return (s or "").strip()
+
+
def _normalize_title(raw: str) -> str:
    """Collapse whitespace runs (incl. newlines) to single spaces and cap at TITLE_MAX_LENGTH.

    Returns "" for falsy input.
    """
    if not raw:
        return ""
    # Slicing is already a no-op for short strings, so the previous explicit
    # length check was redundant.
    return " ".join(raw.split())[:TITLE_MAX_LENGTH]
+
+
def _resolve_mailing_date(csv_mailing_date: str) -> tuple[str, str]:
    """
    Return (mailing_date, title) for this row.

    A non-empty YYYY-MM value is used as-is with a synthetic title; anything
    else maps to the shared placeholder mailing ("unknown" / "Unknown").
    """
    candidate = _norm(csv_mailing_date)
    if not (candidate and MAILING_DATE_PATTERN.match(candidate)):
        return PLACEHOLDER_MAILING_DATE, PLACEHOLDER_MAILING_TITLE
    return candidate, f"{candidate} (from metadata)"
+
+
def _parse_document_date(date_str: str):
    """Return a date (or None) from the CSV date column (e.g. YYYY-MM-DD).

    parse_date returns None for strings that do not look like a date at all.
    NOTE(review): a well-formed but impossible date (e.g. 2025-02-30) makes
    parse_date raise ValueError; that exception intentionally propagates so
    the import command can log "Error parsing document date" and skip the row
    — hence the disabled try/except below. Confirm before re-enabling it.
    """
    cleaned = _norm(date_str)
    if not cleaned:
        return None
    # try:
    return parse_date(cleaned)
    # except (ValueError, TypeError):
    #     return None
+
+
+def _author_names_from_csv(author_str: str) -> list[str]:
+ """Split author column by comma, strip each, drop empty."""
+ cleaned = _norm(author_str)
+ if not cleaned:
+ return []
+ return [a.strip() for a in cleaned.split(",") if a.strip()]
+
+
def _read_csv_rows(csv_path: Path):
    """Yield one dict per CSV row with lower-cased, stripped keys and stripped values.

    The "title" value is additionally collapsed to one line and truncated via
    _normalize_title. Extra unheadered columns (DictReader restkey, k is None)
    are dropped.
    """
    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            out = {}
            for k, v in row.items():
                if k is None:
                    continue
                key = k.strip().lower()
                out[key] = _norm(v) if v is not None else ""
            # Normalize title (multi-line -> single line, truncate)
            if "title" in out:
                out["title"] = _normalize_title(out["title"])
            yield out
+
+
@dataclass(frozen=True)
class _CsvImportRow:
    """Normalized view of one metadata.csv row, ready for upsert."""

    paper_id: str  # lower-cased, e.g. "p1234r0"
    url: str
    mailing_date: str  # "YYYY-MM" or the "unknown" placeholder
    mailing_title: str
    document_date: Optional[date]
    year: Optional[int]  # from mailing_date, else document_date.year, else None
    title: str  # falls back to paper_id when the CSV title is empty
    subgroup: str
    author_names: list[str]
+
def _parse_csv_import_row(row: dict) -> _CsvImportRow | None:
    """Return parsed row, or None when paper_id or url is missing."""
    paper_id = (row.get("paper_id", "") or "").strip().lower()
    url = row.get("url", "")
    if not (paper_id and url):
        return None

    mailing_date, mailing_title = _resolve_mailing_date(row.get("mailing_date", ""))
    document_date = _parse_document_date(row.get("date", ""))

    # Prefer the mailing date for the year; fall back to the document date.
    year = None
    if mailing_date and MAILING_DATE_PATTERN.match(mailing_date):
        year = int(mailing_date[:4])
    elif document_date is not None:
        year = document_date.year

    return _CsvImportRow(
        paper_id=paper_id,
        url=url,
        mailing_date=mailing_date,
        mailing_title=mailing_title,
        document_date=document_date,
        year=year,
        title=row.get("title", "") or paper_id,
        subgroup=row.get("subgroup", ""),
        author_names=_author_names_from_csv(row.get("author", "")),
    )
+
+
def _log_dry_run_row(parsed: _CsvImportRow) -> None:
    """Log what a real run would do for this row (no DB access in dry-run mode)."""
    logger.info(
        "Would create/update paper %s -> mailing %r, document_date=%s, authors=%d",
        parsed.paper_id,
        parsed.mailing_date,
        parsed.document_date,
        len(parsed.author_names),
    )
+
+
def _attach_csv_authors_to_paper(paper: WG21Paper, author_names: list[str]) -> None:
    """Link each author name to *paper* with a 1-based author_order.

    Profiles are resolved/created by display name via the user-tracker
    service. The import is function-local — presumably to avoid a
    module-level cross-app dependency (TODO confirm).
    """
    from cppa_user_tracker.services import (
        get_or_create_wg21_paper_author_profile,
    )

    # enumerate(..., start=1) yields the 1-based order directly.
    for order, name in enumerate(author_names, start=1):
        profile, _ = get_or_create_wg21_paper_author_profile(name)
        get_or_create_paper_author(paper, profile, order)
+
+
def _update_paper_on_integrity_error(
    parsed: _CsvImportRow, exc: IntegrityError, stats: dict
) -> None:
    """Fallback after get_or_create_paper raised IntegrityError: update the row in place.

    Looks up the existing paper by (paper_id, year) and overwrites its metadata
    from the CSV row; counts the row as skipped when no such paper exists.
    Mutates the *stats* counters.
    """
    # NOTE(review): this call sits outside the try below, so a failure here
    # propagates to the caller instead of being counted as skipped — confirm
    # that is intended.
    mailing, _ = get_or_create_mailing(parsed.mailing_date, parsed.mailing_title)
    try:
        # year None maps to 0 for the lookup — presumably the model default
        # for papers without a derivable year (TODO confirm against model).
        lookup_year = parsed.year if parsed.year is not None else 0
        paper = WG21Paper.objects.filter(
            paper_id=parsed.paper_id, year=lookup_year
        ).first()
        if paper is None:
            stats["skipped"] += 1
            logger.error("Error for paper_id=%s: %s", parsed.paper_id, exc)
            return
        paper.url = parsed.url
        paper.title = parsed.title
        paper.document_date = parsed.document_date
        paper.mailing = mailing
        paper.subgroup = parsed.subgroup
        if parsed.year is not None:
            paper.year = parsed.year
        paper.save()
        stats["papers_updated"] += 1
        if parsed.author_names:
            _attach_csv_authors_to_paper(paper, parsed.author_names)
    except Exception:
        stats["skipped"] += 1
        logger.exception(
            "Error for paper_id=%s (after IntegrityError).",
            parsed.paper_id,
        )
+
+
def _upsert_paper_from_csv_row(parsed: _CsvImportRow, stats: dict) -> None:
    """Create or update one WG21Paper (plus its mailing and authors) from a parsed row.

    Mutates *stats*: increments mailings_created and papers_created /
    papers_updated, or skipped on an unrecoverable error. Per-row failures
    are deliberately swallowed (logged) so a bad row cannot abort the import.
    """
    try:
        mailing, mailing_created = get_or_create_mailing(
            parsed.mailing_date, parsed.mailing_title
        )
        if mailing_created:
            stats["mailings_created"] += 1

        _paper, paper_created = get_or_create_paper(
            paper_id=parsed.paper_id,
            url=parsed.url,
            title=parsed.title,
            document_date=parsed.document_date,
            mailing=mailing,
            subgroup=parsed.subgroup,
            author_names=parsed.author_names if parsed.author_names else None,
            year=parsed.year,
        )
        if paper_created:
            stats["papers_created"] += 1
        else:
            stats["papers_updated"] += 1
    except IntegrityError as e:
        # Likely a (paper_id, year) uniqueness clash; retry as in-place update.
        _update_paper_on_integrity_error(parsed, e, stats)
    except Exception as e:
        stats["skipped"] += 1
        # logger.exception keeps the traceback (logger.error dropped it).
        logger.exception("Error for paper_id=%s: %s", parsed.paper_id, e)
+
+
class Command(BaseCommand):
    """Import WG21 paper metadata from a CSV export into the tracker tables."""

    help = (
        "Read metadata CSV and fill WG21Mailing and WG21Paper (and authors). "
        "CSV columns: filename, paper_id, url, title, author, date, mailing_date, subgroup. "
        "When mailing_date is empty, papers are linked to a single 'unknown' mailing."
    )

    def add_arguments(self, parser):
        """Register --csv-file (path override) and --dry-run (no DB writes)."""
        parser.add_argument(
            "--csv-file",
            type=Path,
            default=None,
            help="Path to metadata CSV (default: workspace/wg21_paper_tracker/metadata.csv)",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Only read CSV and report what would be done; do not write to DB.",
        )

    def handle(self, *args, **options):
        """Stream CSV rows, parse each, and upsert (or only log when --dry-run)."""
        csv_path = options.get("csv_file") or (get_workspace_root() / "metadata.csv")
        dry_run = options["dry_run"]

        if not csv_path.exists():
            raise CommandError(f"File not found: {csv_path}")

        if dry_run:
            logger.info("Dry run: no DB writes.")

        # Counters mutated in place by the per-row helpers.
        stats = {
            "rows": 0,
            "skipped": 0,
            "mailings_created": 0,
            "papers_created": 0,
            "papers_updated": 0,
        }

        for row in _read_csv_rows(csv_path):
            stats["rows"] += 1
            try:
                parsed = _parse_csv_import_row(row)
            except Exception as e:
                # Typically parse_date raising on an impossible document date.
                stats["skipped"] += 1
                paper_id = (row.get("paper_id", "") or "").strip().lower()
                logger.error(
                    "Error parsing document date for paper_id=%s: %s",
                    paper_id,
                    e,
                )
                continue

            if parsed is None:
                stats["skipped"] += 1
                # Only detail the first few skips to avoid flooding the log.
                if stats["skipped"] <= 5:
                    logger.debug(
                        "Skipping row: missing paper_id or url: %s",
                        row.get("paper_id", "") or row.get("url", "")[:50],
                    )
                continue

            if dry_run:
                _log_dry_run_row(parsed)
                continue

            _upsert_paper_from_csv_row(parsed, stats)

        logger.info(
            "Rows processed: %d, skipped: %d, mailings created: %d, papers created: %d, papers updated: %d",
            stats["rows"],
            stats["skipped"],
            stats["mailings_created"],
            stats["papers_created"],
            stats["papers_updated"],
        )
        logger.info("Done.")
diff --git a/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py
new file mode 100644
index 00000000..3f0965d8
--- /dev/null
+++ b/wg21_paper_tracker/management/commands/run_wg21_paper_tracker.py
@@ -0,0 +1,158 @@
+"""
+Management command for WG21 Paper Tracker.
+Runs the pipeline to fetch new mailings, upsert paper metadata in the DB, and optionally
+trigger a GitHub repository_dispatch so another repo can download and convert documents.
+"""
+
+import logging
+
+import requests
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+
+from wg21_paper_tracker.pipeline import run_tracker_pipeline
+
+logger = logging.getLogger(__name__)
+
+GITHUB_DISPATCH_URL = "https://api.github.com/repos/{repo}/dispatches"
+
+
def trigger_github_repository_dispatch(
    repo: str,
    event_type: str,
    token: str,
    paper_urls: list[str],
) -> None:
    """POST a repository_dispatch whose client_payload is {"papers": paper_urls}.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx response,
    after logging the status and body.
    """
    endpoint = GITHUB_DISPATCH_URL.format(repo=repo.strip())
    payload = {
        "event_type": event_type,
        "client_payload": {"papers": paper_urls},
    }
    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token.strip()}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    logger.info(
        "Sending repository_dispatch to %s (event_type=%s, %d URLs).",
        repo,
        event_type,
        len(paper_urls),
    )
    response = requests.post(endpoint, json=payload, headers=request_headers, timeout=30)
    if response.ok:
        return
    logger.error(
        "GitHub repository_dispatch failed: %s %s",
        response.status_code,
        response.text,
    )
    response.raise_for_status()
+
+
class Command(BaseCommand):
    """Run WG21 paper tracker and optionally trigger GitHub repository_dispatch."""

    help = (
        "Run WG21 paper tracker (scrape, DB update) and send new paper URLs via "
        "repository_dispatch when enabled."
    )

    def add_arguments(self, parser):
        """Register --dry-run and the optional YYYY-MM mailing-range bounds."""
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="Only log what would be done; do not run the pipeline or dispatch.",
        )
        parser.add_argument(
            "--from-date",
            dest="from_date",
            metavar="YYYY-MM",
            default=None,
            help=(
                "Process mailings with mailing_date >= YYYY-MM (WG21 / CSV style). "
                "Backfills from that mailing onward; without --to-date, no upper cap."
            ),
        )
        parser.add_argument(
            "--to-date",
            dest="to_date",
            metavar="YYYY-MM",
            default=None,
            help=(
                "Upper bound: mailing_date <= YYYY-MM. With --from-date, inclusive range; "
                "without --from-date, still only mailings newer than DB latest (capped at to)."
            ),
        )

    def handle(self, *args, **options):
        """Run the pipeline, then dispatch new paper URLs to GitHub when configured.

        Raises CommandError when the pipeline rejects the date-range arguments
        (ValueError); re-raises any other pipeline/dispatch failure after logging.
        """
        dry_run = options.get("dry_run", False)
        from_date = options.get("from_date")
        to_date = options.get("to_date")
        # Blank or whitespace-only date options are treated the same as "not given".
        if from_date is not None:
            from_date = from_date.strip()
            if not from_date:
                from_date = None
        if to_date is not None:
            to_date = to_date.strip()
            if not to_date:
                to_date = None
        if dry_run:
            # Dry run exits before any scraping, DB writes, or HTTP calls.
            if from_date or to_date:
                logger.info(
                    "Dry run: skipping pipeline and GitHub dispatch "
                    "(from=%r, to=%r).",
                    from_date,
                    to_date,
                )
            else:
                logger.info("Dry run: skipping pipeline and GitHub dispatch.")
            return

        logger.info("Starting WG21 Paper Tracker...")

        try:
            result = run_tracker_pipeline(
                from_mailing_date=from_date,
                to_mailing_date=to_date,
            )
            n = result.new_paper_count
            logger.info("Recorded %d new paper(s); %d URL(s) for dispatch.", n, n)

            if not n:
                logger.info("No new papers in this run. Skipping GitHub dispatch.")
                return

            # Dispatch settings: all three (enabled, repo, token) must be configured.
            repo = getattr(settings, "WG21_GITHUB_DISPATCH_REPO", "") or ""
            token = getattr(settings, "WG21_GITHUB_DISPATCH_TOKEN", "") or ""
            enabled = getattr(settings, "WG21_GITHUB_DISPATCH_ENABLED", False)
            event_type = getattr(
                settings,
                "WG21_GITHUB_DISPATCH_EVENT_TYPE",
                "wg21_papers_convert",
            )

            if not enabled or not repo or not token:
                logger.warning(
                    "Skipping GitHub dispatch: set WG21_GITHUB_DISPATCH_ENABLED=True "
                    "and configure WG21_GITHUB_DISPATCH_REPO and "
                    "WG21_GITHUB_DISPATCH_TOKEN."
                )
                return
            try:
                trigger_github_repository_dispatch(
                    repo,
                    event_type,
                    token,
                    list(result.new_paper_urls),
                )
                logger.info("repository_dispatch sent successfully.")
            except Exception:
                # NOTE(review): this logs here and again in the outer handler
                # below (double logging). The re-raise is intentional so the
                # command exits non-zero on dispatch failure.
                logger.exception("Failed to send repository_dispatch.")
                raise

        except ValueError as e:
            # Invalid --from-date/--to-date range surfaced by the pipeline.
            raise CommandError(str(e)) from e
        except Exception as e:
            logger.exception("WG21 Paper Tracker failed: %s", e)
            raise
diff --git a/wg21_paper_tracker/migrations/0001_initial.py b/wg21_paper_tracker/migrations/0001_initial.py
new file mode 100644
index 00000000..9c6b4d68
--- /dev/null
+++ b/wg21_paper_tracker/migrations/0001_initial.py
@@ -0,0 +1,133 @@
+# Merged initial migration: WG21 Mailing, WG21 Paper (year not null), WG21 Paper Author
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
class Migration(migrations.Migration):
    """Initial schema for wg21_paper_tracker: mailing, paper, paper-author link."""

    initial = True

    dependencies = [
        # Cross-app FK target: WG21PaperAuthorProfile (with author_alias) must exist first.
        ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"),
    ]

    operations = [
        # One row per mailing release, keyed by the unique YYYY-MM-style string.
        migrations.CreateModel(
            name="WG21Mailing",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "mailing_date",
                    models.CharField(db_index=True, max_length=7, unique=True),
                ),
                ("title", models.CharField(max_length=255)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
            ],
            options={
                "verbose_name": "WG21 Mailing",
                "verbose_name_plural": "WG21 Mailings",
                "db_table": "wg21_paper_tracker_wg21mailing",
                "ordering": ["-mailing_date"],
            },
        ),
        # Papers are unique per (paper_id, year); year=0 is the "unknown" placeholder.
        migrations.CreateModel(
            name="WG21Paper",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("paper_id", models.CharField(db_index=True, max_length=255)),
                ("url", models.URLField(max_length=1024)),
                ("title", models.CharField(db_index=True, max_length=1024)),
                (
                    "document_date",
                    models.DateField(blank=True, db_index=True, null=True),
                ),
                ("year", models.IntegerField(db_index=True, default=0)),
                (
                    "subgroup",
                    models.CharField(
                        blank=True, db_index=True, max_length=255
                    ),
                ),
                (
                    "is_downloaded",
                    models.BooleanField(db_index=True, default=False),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                (
                    "mailing",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="papers",
                        to="wg21_paper_tracker.wg21mailing",
                    ),
                ),
            ],
            options={
                "verbose_name": "WG21 Paper",
                "verbose_name_plural": "WG21 Papers",
                "db_table": "wg21_paper_tracker_wg21paper",
                "ordering": ["-document_date", "-paper_id", "-year"],
                "unique_together": {("paper_id", "year")},
            },
        ),
        # Link table between a paper and an author profile, unique per pair.
        migrations.CreateModel(
            name="WG21PaperAuthor",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("author_order", models.PositiveIntegerField(blank=True, null=True)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                (
                    "paper",
                    models.ForeignKey(
                        db_column="paper_id",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="authors",
                        to="wg21_paper_tracker.wg21paper",
                    ),
                ),
                (
                    "profile",
                    models.ForeignKey(
                        db_column="profile_id",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="papers",
                        to="cppa_user_tracker.wg21paperauthorprofile",
                    ),
                ),
            ],
            options={
                "verbose_name": "WG21 Paper Author",
                "verbose_name_plural": "WG21 Paper Authors",
                "db_table": "wg21_paper_tracker_wg21paperauthor",
                "ordering": ["id"],
                "unique_together": {("paper", "profile")},
            },
        ),
    ]
diff --git a/wg21_paper_tracker/migrations/__init__.py b/wg21_paper_tracker/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/wg21_paper_tracker/models.py b/wg21_paper_tracker/models.py
new file mode 100644
index 00000000..fede57ba
--- /dev/null
+++ b/wg21_paper_tracker/models.py
@@ -0,0 +1,79 @@
+"""
+Models per docs/Schema.md section 7: WG21 Papers Tracker.
+References cppa_user_tracker.WG21PaperAuthorProfile (section 1) as author.
+"""
+
+from django.db import models
+
+
class WG21Mailing(models.Model):
    """WG21 mailing release (mailing_date, title)."""

    # Mailing key string; max_length=7 fits a "YYYY-MM" style key. Unique per release.
    mailing_date = models.CharField(max_length=7, unique=True, db_index=True)
    # Human-readable mailing title.
    title = models.CharField(max_length=255)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # Newest first; string ordering is chronological for zero-padded keys.
        ordering = ["-mailing_date"]
        verbose_name = "WG21 Mailing"
        verbose_name_plural = "WG21 Mailings"

    def __str__(self):
        return f"{self.mailing_date} ({self.title})"
+
+
class WG21Paper(models.Model):
    """WG21 paper (paper_id, url, title, document_date, year, mailing, subgroup, is_downloaded)."""

    # Paper number; the services layer stores it stripped and lower-cased.
    paper_id = models.CharField(max_length=255, db_index=True)
    url = models.URLField(max_length=1024)
    title = models.CharField(max_length=1024, db_index=True)
    document_date = models.DateField(db_index=True, null=True, blank=True)
    # year=0 is a placeholder for "unknown year"; services promote such rows
    # to the real year once it becomes known.
    year = models.IntegerField(default=0, db_index=True)
    mailing = models.ForeignKey(
        WG21Mailing,
        on_delete=models.CASCADE,
        related_name="papers",
    )
    subgroup = models.CharField(max_length=255, blank=True, db_index=True)
    # Flipped by mark_paper_downloaded() after the document has been fetched.
    is_downloaded = models.BooleanField(default=False, db_index=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # One row per (paper_id, year); year 0 acts as the "unknown" bucket.
        unique_together = [["paper_id", "year"]]
        ordering = ["-document_date", "-paper_id", "-year"]
        verbose_name = "WG21 Paper"
        verbose_name_plural = "WG21 Papers"

    def __str__(self):
        return f"{self.paper_id}: {self.title[:60]}"
+
+
class WG21PaperAuthor(models.Model):
    """Paper-author link (paper_id, profile_id->WG21PaperAuthorProfile)."""

    # Column name fixed to "paper_id" to match the documented schema.
    paper = models.ForeignKey(
        WG21Paper,
        on_delete=models.CASCADE,
        related_name="authors",
        db_column="paper_id",
    )
    # Cross-app FK into cppa_user_tracker (string reference avoids an import cycle).
    profile = models.ForeignKey(
        "cppa_user_tracker.WG21PaperAuthorProfile",
        on_delete=models.CASCADE,
        related_name="papers",
        db_column="profile_id",
    )
    # 1-based position in the paper's author list; None when order is unknown.
    author_order = models.PositiveIntegerField(null=True, blank=True)
    created_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        # At most one link per (paper, profile) pair.
        unique_together = (("paper", "profile"),)
        ordering = ["id"]
        verbose_name = "WG21 Paper Author"
        verbose_name_plural = "WG21 Paper Authors"

    def __str__(self):
        return f"{self.paper.paper_id} - {self.profile.display_name}"
diff --git a/wg21_paper_tracker/pipeline.py b/wg21_paper_tracker/pipeline.py
new file mode 100644
index 00000000..a6bf41b7
--- /dev/null
+++ b/wg21_paper_tracker/pipeline.py
@@ -0,0 +1,370 @@
+"""
+Pipeline for WG21 Paper Tracker.
+Coordinates scraping and updating the database (metadata only; no file download or GCS).
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import date, datetime
+from typing import Any, Optional
+
+from django.utils.dateparse import parse_date
+
+from wg21_paper_tracker.fetcher import (
+ fetch_all_mailings,
+ fetch_papers_for_mailing,
+)
+from wg21_paper_tracker.models import WG21Mailing
+from wg21_paper_tracker.services import (
+ get_or_create_mailing,
+ get_or_create_paper,
+)
+
+logger = logging.getLogger(__name__)
+
+# WG21 mailing_date and typical CSV column (e.g. 2025-03, 2026-01)
+_MAILING_DATE_LABEL_RE = re.compile(r"^\d{4}-\d{2}$")
+
+
+def _normalize_mailing_date_label(label: str, *, field_name: str) -> str:
+ s = label.strip()
+ if not _MAILING_DATE_LABEL_RE.match(s):
+ raise ValueError(
+ f"Invalid {field_name} {label!r}; "
+ "expected YYYY-MM (e.g. 2025-03), same as WG21 / CSV mailing keys."
+ )
+ return s
+
+
+def _mailing_date_in_run_scope(
+ mailing_date: str,
+ *,
+ latest_date: str,
+ from_mailing_date: Optional[str],
+ to_mailing_date: Optional[str],
+) -> bool:
+ """Whether a mailing key is selected for this run (before retry merge)."""
+ if from_mailing_date is None and to_mailing_date is None:
+ return mailing_date > latest_date
+
+ if from_mailing_date is not None and mailing_date < from_mailing_date:
+ return False
+ if to_mailing_date is not None and mailing_date > to_mailing_date:
+ return False
+ if from_mailing_date is None and to_mailing_date is not None:
+ return mailing_date > latest_date
+ return True
+
+
+def _format_priority(ext: str) -> int:
+ """Prefer adoc > html > ps > pdf when multiple formats exist for one paper_id."""
+ priorities = {"adoc": 1, "html": 2, "ps": 3, "pdf": 4}
+ return priorities.get(ext.lower(), 100)
+
+
+def _parse_mailing_year(m_info: dict) -> int:
+ """Return 4-digit year from the index mailing dict, or 0 if missing/invalid."""
+ mailing_date = m_info["mailing_date"]
+ year_raw = m_info.get("year")
+ if not year_raw or not str(year_raw).strip():
+ logger.warning(
+ "Mailing %s: year missing or empty, using 0 (fix later).",
+ mailing_date,
+ )
+ return 0
+ try:
+ year = int(str(year_raw).strip()[:4])
+ except (ValueError, TypeError):
+ logger.warning(
+ "Mailing %s: year not parseable %r, using 0 (fix later).",
+ mailing_date,
+ year_raw,
+ )
+ return 0
+ if year <= 0 or year > datetime.now().year + 1:
+ logger.warning(
+ "Mailing %s: year invalid, using 0 (fix later).",
+ mailing_date,
+ )
+ return 0
+ return year
+
+
+def _group_fetched_papers_by_id(
+ papers: list[dict[str, Any]], mailing_date: str
+) -> dict[str, list[dict[str, Any]]]:
+ """Bucket fetcher rows by normalized paper_id."""
+ papers_by_id: dict[str, list[dict[str, Any]]] = {}
+ for p in papers:
+ pid = (p.get("paper_id") or "").strip().lower()
+ if not pid:
+ logger.warning(
+ "Skipping paper entry without a paper_id in mailing %s: %r",
+ mailing_date,
+ p,
+ )
+ continue
+ papers_by_id.setdefault(pid, []).append(p)
+ return papers_by_id
+
+
+def _valid_paper_entries_for_id(
+ p_list: list[dict[str, Any]], pid: str, mailing_date: str
+) -> list[dict[str, Any]]:
+ """Keep rows that have type, url, and title (all non-empty)."""
+ valid: list[dict[str, Any]] = []
+ for p in p_list:
+ type_val = (
+ (p.get("type") or "").strip() if isinstance(p.get("type"), str) else ""
+ )
+ url_val = (p.get("url") or "").strip() if isinstance(p.get("url"), str) else ""
+ title_val = (
+ (p.get("title") or "").strip() if isinstance(p.get("title"), str) else ""
+ )
+ if not type_val or not url_val or not title_val:
+ logger.debug(
+ "Skipping malformed paper entry for %s in mailing %s: %r",
+ pid,
+ mailing_date,
+ p,
+ )
+ continue
+ valid.append(p)
+ return valid
+
+
+def _choose_best_format_entry(
+ valid_list: list[dict[str, Any]],
+) -> dict[str, Any]:
+ """Pick one row by format priority (adoc first). Precondition: valid_list non-empty."""
+ return min(
+ valid_list,
+ key=lambda x: _format_priority(str(x.get("type") or "").strip()),
+ )
+
+
def _parse_scraped_document_date(doc_date_str: Any) -> Optional[date]:
    """Best-effort parse of a scraped document date; None for empty or unparseable input."""
    if not doc_date_str:
        return None
    text = str(doc_date_str).strip()
    try:
        return parse_date(text)
    except Exception as exc:
        logger.warning("Failed to parse document date: %s: %s", doc_date_str, exc)
        return None
+
+
def _upsert_paper_from_scraped_row(
    pid: str,
    best_paper: dict[str, Any],
    mailing_obj: WG21Mailing,
    year: int,
    mailing_date: str,
) -> Optional[str]:
    """
    Create or update WG21Paper from the chosen fetcher row.
    Returns the document URL if a **new** row was inserted, else None.
    """
    doc_url = (best_paper.get("url") or "").strip()
    doc_title = (best_paper.get("title") or "").strip()
    if not doc_url or not doc_title:
        logger.warning(
            "Skipping paper %s in mailing %s due to missing required fields: %r",
            pid,
            mailing_date,
            best_paper,
        )
        return None

    raw_authors = best_paper.get("authors")
    author_names = raw_authors if isinstance(raw_authors, list) else []
    _paper, was_created = get_or_create_paper(
        paper_id=pid,
        url=doc_url,
        title=doc_title,
        document_date=_parse_scraped_document_date(best_paper.get("document_date")),
        mailing=mailing_obj,
        subgroup=(best_paper.get("subgroup") or "").strip(),
        author_names=author_names,
        year=year,
    )
    return doc_url if was_created else None
+
+
def _process_single_mailing(m_info: dict) -> list[str]:
    """
    For one mailing from the index: normalize year, get/create WG21Mailing,
    fetch paper rows from the site, upsert WG21Paper rows.

    Returns URLs for papers **newly created** in this run for this mailing.
    """
    mailing_date = m_info["mailing_date"]
    year = _parse_mailing_year(m_info)
    mailing_obj, _created = get_or_create_mailing(mailing_date, m_info["title"])

    fetched = fetch_papers_for_mailing(str(year), mailing_date)
    if not fetched:
        logger.info(
            "Mailing %s: no papers found (anchor/table may be missing).",
            mailing_date,
        )
        return []

    created_urls: list[str] = []
    for pid, entries in _group_fetched_papers_by_id(fetched, mailing_date).items():
        usable = _valid_paper_entries_for_id(entries, pid, mailing_date)
        if not usable:
            logger.warning(
                "Skipping paper %s in mailing %s: no valid entries (type, url, title)",
                pid,
                mailing_date,
            )
            continue
        inserted_url = _upsert_paper_from_scraped_row(
            pid, _choose_best_format_entry(usable), mailing_obj, year, mailing_date
        )
        if inserted_url:
            created_urls.append(inserted_url)

    return created_urls
+
+
@dataclass(frozen=True)
class TrackerPipelineResult:
    """Result of run_tracker_pipeline: URLs for papers newly created in this run."""

    # An empty tuple is immutable, so it is safe as a plain dataclass default.
    new_paper_urls: tuple[str, ...] = ()

    @property
    def new_paper_count(self) -> int:
        """Number of papers newly created in this run."""
        return len(self.new_paper_urls)
+
+
def run_tracker_pipeline(
    *,
    from_mailing_date: Optional[str] = None,
    to_mailing_date: Optional[str] = None,
) -> TrackerPipelineResult:
    """
    Run the WG21 tracker pipeline: scrape mailings, upsert papers in the DB.
    Returns URLs for rows created in this run (for GitHub repository_dispatch).

    Mailing keys are ``YYYY-MM`` (WG21 / CSV style). Selection:

    - Neither ``from_mailing_date`` nor ``to_mailing_date``: process mailings with
      ``mailing_date`` strictly newer than the latest ``WG21Mailing`` in the DB.
    - ``from_mailing_date`` only: ``mailing_date >= from_mailing_date``.
    - ``to_mailing_date`` only: ``mailing_date > latest_in_db`` and
      ``mailing_date <= to_mailing_date`` (incremental runs capped at ``to``).
    - Both: ``from_mailing_date <= mailing_date <= to_mailing_date`` (inclusive).

    ``from_mailing_date`` must not be lexicographically after ``to_mailing_date``.
    """
    # Validate/normalize the bounds first so a bad range fails before any I/O.
    if from_mailing_date is not None:
        from_mailing_date = _normalize_mailing_date_label(
            from_mailing_date, field_name="from_mailing_date"
        )
    if to_mailing_date is not None:
        to_mailing_date = _normalize_mailing_date_label(
            to_mailing_date, field_name="to_mailing_date"
        )
    if (
        from_mailing_date is not None
        and to_mailing_date is not None
        and from_mailing_date > to_mailing_date
    ):
        raise ValueError(
            f"from_mailing_date {from_mailing_date!r} is after "
            f"to_mailing_date {to_mailing_date!r}."
        )

    # 1. Get latest mailing from DB ("unknown" rows exist from the CSV import
    #    and must not be treated as the newest mailing key).
    latest_mailing = (
        WG21Mailing.objects.exclude(mailing_date="unknown")
        .order_by("-mailing_date")
        .first()
    )
    latest_date = latest_mailing.mailing_date if latest_mailing else "1970-01"

    # 2. Fetch all mailings
    all_mailings = fetch_all_mailings()
    if not all_mailings:
        logger.warning("No mailings found on WG21 site.")
        return TrackerPipelineResult()

    # Filter mailings to process
    new_mailings = [
        m
        for m in all_mailings
        if _mailing_date_in_run_scope(
            m["mailing_date"],
            latest_date=latest_date,
            from_mailing_date=from_mailing_date,
            to_mailing_date=to_mailing_date,
        )
    ]
    # Human-readable description of the selection baseline, used only for logging.
    if from_mailing_date is None and to_mailing_date is None:
        baseline_desc = f"latest_in_db={latest_date}"
    else:
        parts: list[str] = []
        if from_mailing_date is not None:
            parts.append(f"from={from_mailing_date}")
        if to_mailing_date is not None:
            parts.append(f"to={to_mailing_date}")
        if from_mailing_date is None:
            parts.append(f"latest_in_db={latest_date}")
        baseline_desc = ", ".join(parts)

    # Requeue incomplete mailings so transient failures get retried (not just the latest)
    retry_dates = set(
        WG21Mailing.objects.filter(papers__isnull=True).values_list(
            "mailing_date", flat=True
        )
    )
    if latest_mailing:
        retry_dates.add(latest_mailing.mailing_date)
    # NOTE(review): in the default run (no from/to bounds) this scope filter
    # removes every retry candidate — "d > latest_date" is never true for any
    # existing mailing, including the latest one added just above — so the
    # retry/requeue appears to be a no-op outside explicit date ranges.
    # Confirm whether the latest mailing was meant to be re-fetched here.
    retry_dates = {
        d
        for d in retry_dates
        if _mailing_date_in_run_scope(
            d,
            latest_date=latest_date,
            from_mailing_date=from_mailing_date,
            to_mailing_date=to_mailing_date,
        )
    }
    new_mailing_dates = set(m["mailing_date"] for m in new_mailings)
    for current_m in all_mailings:
        if (
            current_m["mailing_date"] in retry_dates
            and current_m["mailing_date"] not in new_mailing_dates
        ):
            new_mailings.append(current_m)

    # Sort chronologically (oldest to newest)
    new_mailings.sort(key=lambda x: x["mailing_date"])

    logger.info(
        "Pipeline: %s, all_mailings=%d, mailings_to_process=%s",
        baseline_desc,
        len(all_mailings),
        [m["mailing_date"] for m in new_mailings],
    )
    new_urls: list[str] = []
    for m_info in new_mailings:
        new_urls.extend(_process_single_mailing(m_info))

    return TrackerPipelineResult(new_paper_urls=tuple(new_urls))
diff --git a/wg21_paper_tracker/services.py b/wg21_paper_tracker/services.py
new file mode 100644
index 00000000..983493a7
--- /dev/null
+++ b/wg21_paper_tracker/services.py
@@ -0,0 +1,186 @@
+"""
+Database logic for WG21 Paper Tracker.
+"""
+
+from __future__ import annotations
+
+from datetime import date
+from typing import TYPE_CHECKING, Optional
+
+from django.db import IntegrityError, transaction
+
+from cppa_user_tracker.services import get_or_create_wg21_paper_author_profile
+from wg21_paper_tracker.models import WG21Mailing, WG21Paper, WG21PaperAuthor
+
+if TYPE_CHECKING:
+ from cppa_user_tracker.models import WG21PaperAuthorProfile
+
+
+def _normalize_year(year: int | str | None) -> int:
+ """Return a 4-digit year as int, or 0 if missing/invalid."""
+ if year is None:
+ return 0
+ if isinstance(year, int):
+ return year if 0 < year <= 9999 else 0
+ s = str(year).strip()[:4]
+ return int(s) if s.isdigit() else 0
+
+
@transaction.atomic
def get_or_create_mailing(mailing_date: str, title: str) -> tuple[WG21Mailing, bool]:
    """Fetch or create the mailing row for mailing_date; refresh its title if it drifted."""
    obj, was_created = WG21Mailing.objects.get_or_create(
        mailing_date=mailing_date,
        defaults={"title": title},
    )
    if not was_created and obj.title != title:
        obj.title = title
        obj.save(update_fields=["title", "updated_at"])
    return obj, was_created
+
+
def get_or_create_paper(
    paper_id: str,
    url: str,
    title: str,
    document_date: date | None,
    mailing: WG21Mailing,
    subgroup: str = "",
    author_names: Optional[list[str]] = None,
    author_emails: Optional[list[str]] = None,
    year: int | None = None,
) -> tuple[WG21Paper, bool]:
    """Upsert a WG21Paper keyed on (paper_id, year); returns (paper, created).

    Lookup order for year > 0: the exact (paper_id, year) row; else a
    placeholder (paper_id, 0) row is promoted to the real year; else a new
    row is created. For year 0 (unknown), get_or_create on (paper_id, 0).
    When author_names is given and the paper already existed, its author
    links are deleted and rebuilt (author_emails pairs by index, 1-based
    author_order).

    Raises ValueError when paper_id is empty after normalization.
    """
    paper_id = (paper_id or "").strip().lower()
    if not paper_id:
        raise ValueError("paper_id is required")
    year_val = _normalize_year(year)

    def _update_paper(paper: WG21Paper) -> bool:
        # Sync mutable fields field-by-field; save only when something changed.
        updated = False
        if paper.url != url:
            paper.url = url
            updated = True
        if paper.title != title:
            paper.title = title
            updated = True
        if paper.document_date != document_date:
            paper.document_date = document_date
            updated = True
        if paper.mailing_id != mailing.id:
            paper.mailing = mailing
            updated = True
        if paper.subgroup != subgroup:
            paper.subgroup = subgroup
            updated = True
        if paper.year != year_val:
            paper.year = year_val
            updated = True
        if updated:
            paper.save()
        return updated

    try:
        with transaction.atomic():
            if year_val > 0:
                # Prefer exact (paper_id, year); else promote placeholder (paper_id, 0) to real year
                paper = WG21Paper.objects.filter(
                    paper_id=paper_id, year=year_val
                ).first()
                if paper:
                    _update_paper(paper)
                    created = False
                else:
                    placeholder = WG21Paper.objects.filter(
                        paper_id=paper_id, year=0
                    ).first()
                    if placeholder:
                        # Promotion can collide with a concurrently created
                        # (paper_id, year_val) row; the unique constraint then
                        # fires and the outer except block recovers.
                        try:
                            placeholder.url = url
                            placeholder.title = title
                            placeholder.document_date = document_date
                            placeholder.mailing = mailing
                            placeholder.subgroup = subgroup
                            placeholder.year = year_val
                            placeholder.save()
                            paper = placeholder
                            created = False
                        except IntegrityError:
                            raise  # Roll back this transaction; recovery runs below
                    else:
                        paper, created = WG21Paper.objects.get_or_create(
                            paper_id=paper_id,
                            year=year_val,
                            defaults={
                                "url": url,
                                "title": title,
                                "document_date": document_date,
                                "mailing": mailing,
                                "subgroup": subgroup,
                            },
                        )
            else:
                paper, created = WG21Paper.objects.get_or_create(
                    paper_id=paper_id,
                    year=0,
                    defaults={
                        "url": url,
                        "title": title,
                        "document_date": document_date,
                        "mailing": mailing,
                        "subgroup": subgroup,
                    },
                )
                if not created:
                    _update_paper(paper)
    except IntegrityError:
        # Placeholder promotion hit (paper_id, year_val) unique constraint; fetch and update canonical row
        with transaction.atomic():
            paper = WG21Paper.objects.filter(paper_id=paper_id, year=year_val).first()
            if not paper:
                raise
            _update_paper(paper)
            created = False

    if author_names:
        if not created:
            # Rebuild links from scratch so removed/reordered authors don't linger.
            for author in paper.authors.all():
                author.delete()
        emails = author_emails or []
        for i, name in enumerate(author_names):
            email = emails[i] if i < len(emails) else None
            profile, _ = get_or_create_wg21_paper_author_profile(name, email=email)
            get_or_create_paper_author(paper, profile, i + 1)

    return paper, created
+
+
def get_or_create_paper_author(
    paper: WG21Paper,
    profile: WG21PaperAuthorProfile,
    author_order: int,
) -> tuple[WG21PaperAuthor, bool]:
    """Get or create a WG21PaperAuthor link for (paper, profile), with author_order (1-based).
    Updates author_order on existing link if it differs.
    """
    # isinstance check must come first: a non-int must raise ValueError, not TypeError.
    if not isinstance(author_order, int) or author_order <= 0:
        raise ValueError("author_order must be a positive integer")
    link, was_created = WG21PaperAuthor.objects.get_or_create(
        paper=paper,
        profile=profile,
        defaults={"author_order": author_order},
    )
    if was_created or link.author_order == author_order:
        return link, was_created
    link.author_order = author_order
    link.save(update_fields=["author_order"])
    return link, was_created
+
+
def mark_paper_downloaded(paper_id: str, year: int | None = None) -> int:
    """Mark the (paper_id, year) paper row as downloaded.

    Args:
        paper_id: WG21 paper id; normalized to stripped lower case, must be non-empty.
        year: Year of the paper row; must be passed explicitly. Pass 0 for
            placeholder papers stored with year=0.

    Returns:
        Number of rows updated (0 when no matching paper exists), so callers
        can detect a silent no-op. Previously this count was discarded.

    Raises:
        ValueError: If paper_id is empty or year is None.
    """
    paper_id = (paper_id or "").strip().lower()
    if not paper_id:
        raise ValueError("paper_id is required")
    if year is None:
        raise ValueError("year is required; pass 0 explicitly for placeholder papers")
    # QuerySet.update() returns the number of matched rows.
    return WG21Paper.objects.filter(
        paper_id=paper_id,
        year=_normalize_year(year),
    ).update(is_downloaded=True)
diff --git a/wg21_paper_tracker/tests/__init__.py b/wg21_paper_tracker/tests/__init__.py
new file mode 100644
index 00000000..18e481d7
--- /dev/null
+++ b/wg21_paper_tracker/tests/__init__.py
@@ -0,0 +1 @@
+# Tests for wg21_paper_tracker app (excluding cloud_run_job).
diff --git a/wg21_paper_tracker/tests/test_commands.py b/wg21_paper_tracker/tests/test_commands.py
new file mode 100644
index 00000000..a099ccb2
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_commands.py
@@ -0,0 +1,138 @@
+"""Tests for wg21_paper_tracker management commands."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from django.core.management import call_command
+from django.core.management.base import CommandError
+from django.test.utils import override_settings
+
+from wg21_paper_tracker.pipeline import TrackerPipelineResult
+
+
+CMD_NAME = "import_wg21_metadata_from_csv"
+RUN_TRACKER_CMD = "run_wg21_paper_tracker"
+
+
def test_import_wg21_metadata_from_csv_raises_when_csv_missing(tmp_path):
    """Command raises CommandError when CSV file does not exist."""
    missing_csv = tmp_path / "nonexistent.csv"
    assert not missing_csv.exists()

    with pytest.raises(CommandError, match=r"File not found:"):
        call_command(CMD_NAME, f"--csv-file={missing_csv}")
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_posts_dispatch_when_enabled():
    """run_wg21_paper_tracker sends repository_dispatch with papers URL list."""
    urls = ("https://open-std.org/a.pdf", "https://open-std.org/b.pdf")
    fake_response = MagicMock(ok=True, status_code=204, text="")

    pipeline_patch = patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
        return_value=TrackerPipelineResult(new_paper_urls=urls),
    )
    post_patch = patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post",
        return_value=fake_response,
    )
    settings_patch = override_settings(
        WG21_GITHUB_DISPATCH_ENABLED=True,
        WG21_GITHUB_DISPATCH_REPO="myorg/convert-repo",
        WG21_GITHUB_DISPATCH_TOKEN="secret-token",
        WG21_GITHUB_DISPATCH_EVENT_TYPE="wg21_papers_convert",
    )
    with pipeline_patch, post_patch as mocked_post, settings_patch:
        call_command(RUN_TRACKER_CMD)

    mocked_post.assert_called_once()
    assert mocked_post.call_args[0][0] == (
        "https://api.github.com/repos/myorg/convert-repo/dispatches"
    )
    sent_body = mocked_post.call_args[1]["json"]
    assert sent_body["event_type"] == "wg21_papers_convert"
    assert sent_body["client_payload"] == {"papers": list(urls)}
    sent_headers = mocked_post.call_args[1]["headers"]
    assert sent_headers["Authorization"] == "Bearer secret-token"
    assert sent_headers["Accept"] == "application/vnd.github+json"
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_skips_post_when_no_new_papers():
    """No HTTP request when pipeline returns no new URLs."""
    with patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
        return_value=TrackerPipelineResult(),
    ), patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post",
    ) as mocked_post, override_settings(
        WG21_GITHUB_DISPATCH_ENABLED=True,
        WG21_GITHUB_DISPATCH_REPO="o/r",
        WG21_GITHUB_DISPATCH_TOKEN="t",
    ):
        call_command(RUN_TRACKER_CMD)

    mocked_post.assert_not_called()
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_skips_post_when_dispatch_disabled():
    """No HTTP request when WG21_GITHUB_DISPATCH_ENABLED is False."""
    with patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
        return_value=TrackerPipelineResult(new_paper_urls=("https://x/y",)),
    ), patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.requests.post",
    ) as mocked_post, override_settings(
        WG21_GITHUB_DISPATCH_ENABLED=False,
        WG21_GITHUB_DISPATCH_REPO="o/r",
        WG21_GITHUB_DISPATCH_TOKEN="t",
    ):
        call_command(RUN_TRACKER_CMD)

    mocked_post.assert_not_called()
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_rejects_invalid_from_date():
    """--from-date must be YYYY-MM."""
    with pytest.raises(CommandError, match="Invalid from_mailing_date"):
        call_command(RUN_TRACKER_CMD, "--from-date", "bad")
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_passes_from_date_to_pipeline():
    """--from-date is forwarded to the pipeline as from_mailing_date."""
    with patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
        return_value=TrackerPipelineResult(),
    ) as mocked_pipeline:
        call_command(RUN_TRACKER_CMD, "--from-date", "2025-03")

    mocked_pipeline.assert_called_once_with(
        from_mailing_date="2025-03", to_mailing_date=None
    )
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_rejects_invalid_to_date():
    """--to-date must be YYYY-MM."""
    with pytest.raises(CommandError, match="Invalid to_mailing_date"):
        call_command(RUN_TRACKER_CMD, "--to-date", "bad")
+
+
@pytest.mark.django_db
def test_run_wg21_paper_tracker_passes_from_and_to_date_to_pipeline():
    """Both date bounds are forwarded to the pipeline."""
    with patch(
        "wg21_paper_tracker.management.commands.run_wg21_paper_tracker.run_tracker_pipeline",
        return_value=TrackerPipelineResult(),
    ) as mocked_pipeline:
        call_command(RUN_TRACKER_CMD, "--from-date", "2025-01", "--to-date", "2025-03")

    mocked_pipeline.assert_called_once_with(
        from_mailing_date="2025-01", to_mailing_date="2025-03"
    )
diff --git a/wg21_paper_tracker/tests/test_fetcher.py b/wg21_paper_tracker/tests/test_fetcher.py
new file mode 100644
index 00000000..93b21588
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_fetcher.py
@@ -0,0 +1,274 @@
+"""Tests for wg21_paper_tracker.fetcher."""
+
+from unittest.mock import patch, MagicMock
+
+import requests
+from bs4 import BeautifulSoup
+
+from wg21_paper_tracker.fetcher import (
+ BASE_URL,
+ extract_paper_metadata_from_table_row,
+ fetch_all_mailings,
+ fetch_papers_for_mailing,
+)
+
+
+# --- fetch_all_mailings ---
+
+
+def test_fetch_all_mailings_returns_empty_on_request_failure():
+ """fetch_all_mailings returns [] when requests.get raises RequestException."""
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ m.side_effect = requests.RequestException("network error")
+ result = fetch_all_mailings()
+ assert result == []
+
+
+def test_fetch_all_mailings_returns_empty_on_http_error():
+ """fetch_all_mailings returns [] when response.raise_for_status raises HTTPError."""
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.raise_for_status.side_effect = requests.HTTPError("404")
+ m.return_value = resp
+ result = fetch_all_mailings()
+ assert result == []
+
+
+def test_fetch_all_mailings_parses_links():
+ """fetch_all_mailings parses year/#mailingYYYY-MM links and returns mailings."""
+ html = """
+
+ 2025-01 pre-meeting mailing
+ 2025-02 post-meeting mailing
+ 2024-11 mailing
+ Ignore
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_all_mailings()
+ assert len(result) == 3
+ assert result[0]["mailing_date"] == "2025-01"
+ assert result[0]["title"] == "2025-01 pre-meeting mailing"
+ assert result[0]["year"] == "2025"
+ assert result[1]["mailing_date"] == "2025-02"
+ assert result[2]["mailing_date"] == "2024-11"
+ assert result[2]["year"] == "2024"
+
+
+def test_fetch_all_mailings_calls_index_url():
+ """fetch_all_mailings calls BASE_URL/ with timeout."""
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ m.return_value = MagicMock(text="", raise_for_status=MagicMock())
+ fetch_all_mailings()
+ m.assert_called_once_with(f"{BASE_URL}/", timeout=30)
+
+
+# --- fetch_papers_for_mailing ---
+
+
+def test_fetch_papers_for_mailing_returns_empty_on_request_failure():
+ """fetch_papers_for_mailing returns [] when requests.get raises RequestException."""
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ m.side_effect = requests.RequestException("timeout")
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert result == []
+
+
+def test_fetch_papers_for_mailing_returns_empty_when_anchor_missing():
+ """fetch_papers_for_mailing returns [] when mailing anchor is not found."""
+ html = "x
"
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert result == []
+
+
+def test_fetch_papers_for_mailing_finds_anchor_by_id():
+ """fetch_papers_for_mailing finds anchor by id=mailingYYYY-MM."""
+ html = """
+
+
+
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert len(result) == 1
+ assert result[0]["paper_id"] == "p1000r0"
+ assert result[0]["filename"] == "p1000r0.pdf"
+ assert result[0]["title"] == "Title"
+ assert result[0]["authors"] == ["Author"]
+ assert result[0]["document_date"] == "2025-01-15"
+ assert result[0]["subgroup"] == "SG1"
+
+
+def test_fetch_papers_for_mailing_finds_anchor_by_name():
+ """fetch_papers_for_mailing finds anchor by name= when id is missing."""
+ html = """
+
+
+
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert len(result) == 1
+ assert result[0]["paper_id"] == "n5034"
+ assert result[0]["type"] == "html"
+
+
+def test_fetch_papers_for_mailing_normalizes_paper_id_lowercase():
+ """fetch_papers_for_mailing returns paper_id in lowercase."""
+ html = """
+
+
+
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert result[0]["paper_id"] == "p3039r1"
+ assert result[0]["filename"] == "p3039r1.pdf"
+
+
+def test_fetch_papers_for_mailing_returns_empty_when_no_table():
+ """fetch_papers_for_mailing returns [] when no table follows anchor."""
+ html = """
+
+
+ No table here
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ result = fetch_papers_for_mailing("2025", "2025-01")
+ assert result == []
+
+
+def test_fetch_papers_for_mailing_does_not_use_next_mailings_table():
+ """First mailing with no table returns []; second mailing's table is not used."""
+ html = """
+
+ 2025-02
+ No papers this month.
+ 2025-01
+
+
+ """
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ resp = MagicMock()
+ resp.text = html
+ resp.raise_for_status = MagicMock()
+ m.return_value = resp
+ first = fetch_papers_for_mailing("2025", "2025-02")
+ second = fetch_papers_for_mailing("2025", "2025-01")
+ assert first == [], "2025-02 has no table; must not attribute 2025-01's table"
+ assert len(second) == 1
+ assert second[0]["paper_id"] == "p1234r1"
+
+
+def test_fetch_papers_for_mailing_calls_year_url():
+ """fetch_papers_for_mailing calls BASE_URL/{year}/ with timeout."""
+ with patch("wg21_paper_tracker.fetcher.requests.get") as m:
+ m.return_value = MagicMock(
+ text="",
+ raise_for_status=MagicMock(),
+ )
+ fetch_papers_for_mailing("2025", "2025-01")
+ m.assert_called_once_with(f"{BASE_URL}/2025/", timeout=30)
+
+
+# --- extract_paper_metadata_from_table_row ---
+
+
+def test_extract_paper_metadata_from_table_row_returns_none_when_no_cells():
+ """Empty cell list yields no paper."""
+ assert extract_paper_metadata_from_table_row([], f"{BASE_URL}/2025/") is None
+
+
+def test_extract_paper_metadata_from_table_row_returns_none_when_no_paper_link():
+ """Row without a matching paper href returns None."""
+ html = "| No link here | t |
"
+ row = BeautifulSoup(html, "html.parser").find("tr")
+ cells = row.find_all(["td", "th"])
+ assert extract_paper_metadata_from_table_row(cells, f"{BASE_URL}/2025/") is None
+
+
+def test_extract_paper_metadata_from_table_row_parses_legacy_five_column_row():
+ """Older tables: Number, Title, Author, Document date, Subgroup (subgroup at index 4)."""
+ html = """
+
+ | P1234R0 |
+ My title |
+ Author One, Author Two |
+ 2025-03-15 |
+ LEWG |
+
+ """
+ row = BeautifulSoup(html, "html.parser").find("tr")
+ cells = row.find_all(["td", "th"])
+ page_url = f"{BASE_URL}/2025/"
+ result = extract_paper_metadata_from_table_row(cells, page_url)
+ assert result is not None
+ assert result["paper_id"] == "p1234r0"
+ assert result["type"] == "pdf"
+ assert result["filename"] == "p1234r0.pdf"
+ assert result["url"] == f"{BASE_URL}/2025/p1234r0.pdf"
+ assert result["title"] == "My title"
+ assert result["authors"] == ["Author One", "Author Two"]
+ assert result["document_date"] == "2025-03-15"
+ assert result["subgroup"] == "LEWG"
+
+
+def test_extract_paper_metadata_from_table_row_parses_eight_column_row():
+ """2026+ style: subgroup is column 7 (index 6), not index 4 (mailing date)."""
+ html = """
+
+ | P1000R7 |
+ C++ IS Schedule (proposed) |
+ Herb Sutter |
+ 2026-01-13 |
+ 2026-01 |
+ P1000R6 |
+ All of WG21 |
+ |
+
+ """
+ row = BeautifulSoup(html, "html.parser").find("tr")
+ cells = row.find_all(["td", "th"])
+ page_url = f"{BASE_URL}/2026/"
+ result = extract_paper_metadata_from_table_row(cells, page_url)
+ assert result is not None
+ assert result["paper_id"] == "p1000r7"
+ assert result["document_date"] == "2026-01-13"
+ assert result["subgroup"] == "All of WG21"
diff --git a/wg21_paper_tracker/tests/test_models.py b/wg21_paper_tracker/tests/test_models.py
new file mode 100644
index 00000000..9b4ee7e8
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_models.py
@@ -0,0 +1,86 @@
+"""Tests for wg21_paper_tracker.models."""
+
+from datetime import date
+
+import pytest
+from django.db import IntegrityError, transaction
+
+from wg21_paper_tracker.models import WG21Mailing, WG21Paper
+
+
+@pytest.mark.django_db
+def test_wg21_mailing_str():
+ """WG21Mailing.__str__ returns mailing_date and title."""
+ m = WG21Mailing.objects.create(mailing_date="2025-01", title="2025-01 pre-meeting")
+ assert str(m) == "2025-01 (2025-01 pre-meeting)"
+
+
+@pytest.mark.django_db
+def test_wg21_paper_str():
+ """WG21Paper.__str__ returns paper_id and truncated title."""
+ m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title")
+ p = WG21Paper.objects.create(
+ paper_id="p1000r0",
+ url="https://example.com/p.pdf",
+ title="A short title",
+ document_date=date(2025, 1, 15),
+ mailing=m,
+ year=2025,
+ )
+ assert "p1000r0" in str(p)
+ assert "A short title" in str(p)
+
+
+@pytest.mark.django_db
+def test_wg21_paper_str_truncates_long_title():
+ """WG21Paper.__str__ truncates title to 60 chars."""
+ m = WG21Mailing.objects.create(mailing_date="2025-01", title="Title")
+ long_title = "x" * 100
+ p = WG21Paper.objects.create(
+ paper_id="p1",
+ url="https://example.com/p.pdf",
+ title=long_title,
+ mailing=m,
+ year=2025,
+ )
+ assert len(str(p).split(": ", 1)[-1]) <= 60
+
+
+@pytest.mark.django_db
+def test_wg21_mailing_ordering():
+ """WG21Mailing default ordering is by mailing_date descending."""
+ WG21Mailing.objects.create(mailing_date="2025-01", title="A")
+ WG21Mailing.objects.create(mailing_date="2025-02", title="B")
+ dates = list(WG21Mailing.objects.values_list("mailing_date", flat=True))
+ assert dates == ["2025-02", "2025-01"]
+
+
+@pytest.mark.django_db
+def test_wg21_paper_unique_together_paper_id_year():
+ """WG21Paper allows same paper_id with different year; rejects duplicate (paper_id, year)."""
+ m1 = WG21Mailing.objects.create(mailing_date="2024-11", title="M1")
+ m2 = WG21Mailing.objects.create(mailing_date="2025-01", title="M2")
+ WG21Paper.objects.create(
+ paper_id="sd-1",
+ url="https://example.com/1.pdf",
+ title="T1",
+ mailing=m1,
+ year=2024,
+ )
+ with pytest.raises(IntegrityError):
+ with transaction.atomic():
+ WG21Paper.objects.create(
+ paper_id="sd-1",
+ url="https://example.com/dup.pdf",
+ title="T1 dup",
+ mailing=m1,
+ year=2024,
+ )
+ p2 = WG21Paper.objects.create(
+ paper_id="sd-1",
+ url="https://example.com/2.pdf",
+ title="T2",
+ mailing=m2,
+ year=2025,
+ )
+ assert p2.pk is not None
diff --git a/wg21_paper_tracker/tests/test_pipeline.py b/wg21_paper_tracker/tests/test_pipeline.py
new file mode 100644
index 00000000..099f105b
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_pipeline.py
@@ -0,0 +1,221 @@
+"""Tests for wg21_paper_tracker.pipeline."""
+
+from unittest.mock import patch
+
+import pytest
+
+from wg21_paper_tracker.pipeline import TrackerPipelineResult, run_tracker_pipeline
+
+
+# --- run_tracker_pipeline ---
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_returns_empty_when_no_mailings():
+ """run_tracker_pipeline returns empty result when fetch_all_mailings returns []."""
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=[]):
+ result = run_tracker_pipeline()
+ assert result.new_paper_count == 0
+ assert result.new_paper_urls == ()
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_skips_when_no_new_mailings():
+ """run_tracker_pipeline returns empty when all mailings are <= latest in DB."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-02", title="Latest")
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings") as m:
+ m.return_value = [
+ {"mailing_date": "2025-01", "title": "Old", "year": "2025"},
+ {"mailing_date": "2025-02", "title": "Latest", "year": "2025"},
+ ]
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=[]
+ ):
+ result = run_tracker_pipeline()
+ assert result.new_paper_count == 0
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_collects_urls_for_new_papers():
+ """run_tracker_pipeline returns URLs for papers created in this run."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-01", title="Previous")
+ mailings = [
+ {"mailing_date": "2025-01", "title": "Previous", "year": "2025"},
+ {"mailing_date": "2025-02", "title": "New", "year": "2025"},
+ ]
+ papers = [
+ {
+ "paper_id": "p1000r0",
+ "url": "https://example.com/p1000r0.pdf",
+ "filename": "p1000r0.pdf",
+ "title": "A paper",
+ "type": "pdf",
+ "authors": [],
+ "document_date": None,
+ "subgroup": "",
+ },
+ ]
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings):
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers
+ ):
+ result = run_tracker_pipeline()
+ assert result.new_paper_count == 1
+ assert result.new_paper_urls == ("https://example.com/p1000r0.pdf",)
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_from_mailing_date_backfills_older_than_db_latest():
+ """from_mailing_date includes mailings >= date even when DB latest is newer."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-02", title="Latest in DB")
+ mailings = [
+ {"mailing_date": "2025-01", "title": "Older", "year": "2025"},
+ {"mailing_date": "2025-02", "title": "Latest in DB", "year": "2025"},
+ ]
+ papers = [
+ {
+ "paper_id": "p1111r0",
+ "url": "https://example.com/p1111r0.pdf",
+ "filename": "p1111r0.pdf",
+ "title": "January paper",
+ "type": "pdf",
+ "authors": [],
+ "document_date": None,
+ "subgroup": "",
+ },
+ ]
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings):
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers
+ ):
+ result = run_tracker_pipeline(from_mailing_date="2025-01")
+ assert result.new_paper_count == 1
+ assert result.new_paper_urls == ("https://example.com/p1111r0.pdf",)
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_second_run_no_new_urls():
+ """Existing papers do not add URLs on a subsequent run."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-01", title="Previous")
+ mailings = [
+ {"mailing_date": "2025-02", "title": "New", "year": "2025"},
+ ]
+ papers = [
+ {
+ "paper_id": "p1000r0",
+ "url": "https://example.com/p1000r0.pdf",
+ "filename": "p1000r0.pdf",
+ "title": "A paper",
+ "type": "pdf",
+ "authors": [],
+ "document_date": None,
+ "subgroup": "",
+ },
+ ]
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings):
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers
+ ):
+ first = run_tracker_pipeline()
+ second = run_tracker_pipeline()
+ assert first.new_paper_count == 1
+ assert second.new_paper_count == 0
+
+
+def test_tracker_pipeline_result_count():
+ """TrackerPipelineResult.new_paper_count matches tuple length."""
+ r = TrackerPipelineResult(new_paper_urls=("a", "b"))
+ assert r.new_paper_count == 2
+
+
+def test_run_tracker_pipeline_rejects_bad_from_mailing_date():
+ """from_mailing_date must look like YYYY-MM."""
+ with pytest.raises(ValueError, match="Invalid from_mailing_date"):
+ run_tracker_pipeline(from_mailing_date="not-valid")
+
+
+def test_run_tracker_pipeline_rejects_bad_to_mailing_date():
+ """to_mailing_date must look like YYYY-MM."""
+ with pytest.raises(ValueError, match="Invalid to_mailing_date"):
+ run_tracker_pipeline(to_mailing_date="not-valid")
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_rejects_from_after_to():
+ with pytest.raises(ValueError, match="after"):
+ run_tracker_pipeline(from_mailing_date="2025-03", to_mailing_date="2025-01")
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_to_mailing_date_caps_inclusive_range():
+ """With from and to, mailings outside [from, to] are skipped."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-03", title="Latest in DB")
+ mailings = [
+ {"mailing_date": "2025-01", "title": "Too early", "year": "2025"},
+ {"mailing_date": "2025-02", "title": "In range", "year": "2025"},
+ {"mailing_date": "2025-03", "title": "In range", "year": "2025"},
+ {"mailing_date": "2025-04", "title": "Too late", "year": "2025"},
+ ]
+ papers = [
+ {
+ "paper_id": "p2222r0",
+ "url": "https://example.com/p2222r0.pdf",
+ "filename": "p2222r0.pdf",
+ "title": "Feb",
+ "type": "pdf",
+ "authors": [],
+ "document_date": None,
+ "subgroup": "",
+ },
+ ]
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings):
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers
+ ) as fetch:
+ result = run_tracker_pipeline(
+ from_mailing_date="2025-02", to_mailing_date="2025-03"
+ )
+ assert result.new_paper_count == 1
+ assert fetch.call_count == 2
+
+
+@pytest.mark.django_db
+def test_run_tracker_pipeline_to_only_caps_incremental_above_latest():
+ """to_mailing_date without from: still require mailing_date > latest_in_db."""
+ from wg21_paper_tracker.models import WG21Mailing
+
+ WG21Mailing.objects.create(mailing_date="2025-01", title="Latest")
+ mailings = [
+ {"mailing_date": "2025-01", "title": "Latest", "year": "2025"},
+ {"mailing_date": "2025-02", "title": "New", "year": "2025"},
+ {"mailing_date": "2025-03", "title": "Too new for cap", "year": "2025"},
+ ]
+ papers = [
+ {
+ "paper_id": "p3333r0",
+ "url": "https://example.com/p3333r0.pdf",
+ "filename": "p3333r0.pdf",
+ "title": "A",
+ "type": "pdf",
+ "authors": [],
+ "document_date": None,
+ "subgroup": "",
+ },
+ ]
+ with patch("wg21_paper_tracker.pipeline.fetch_all_mailings", return_value=mailings):
+ with patch(
+ "wg21_paper_tracker.pipeline.fetch_papers_for_mailing", return_value=papers
+ ) as fetch:
+ result = run_tracker_pipeline(to_mailing_date="2025-02")
+ assert result.new_paper_count == 1
+ assert fetch.call_count == 1
diff --git a/wg21_paper_tracker/tests/test_services.py b/wg21_paper_tracker/tests/test_services.py
new file mode 100644
index 00000000..bd3a3e29
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_services.py
@@ -0,0 +1,260 @@
+"""Tests for wg21_paper_tracker.services."""
+
+from datetime import date
+from unittest.mock import patch
+
+import pytest
+
+from wg21_paper_tracker.services import (
+ get_or_create_mailing,
+ get_or_create_paper,
+ mark_paper_downloaded,
+)
+
+
+# --- get_or_create_mailing ---
+
+
+@pytest.mark.django_db
+def test_get_or_create_mailing_creates_new():
+ """get_or_create_mailing creates new mailing and returns (mailing, True)."""
+ m, created = get_or_create_mailing("2025-01", "2025-01 pre-meeting mailing")
+ assert created is True
+ assert m.mailing_date == "2025-01"
+ assert m.title == "2025-01 pre-meeting mailing"
+
+
+@pytest.mark.django_db
+def test_get_or_create_mailing_gets_existing():
+ """get_or_create_mailing returns existing mailing and (mailing, False)."""
+ get_or_create_mailing("2025-01", "Original title")
+ m2, created2 = get_or_create_mailing("2025-01", "Updated title")
+ assert created2 is False
+ assert m2.mailing_date == "2025-01"
+ assert m2.title == "Updated title" # title is updated when different
+
+
+@pytest.mark.django_db
+def test_get_or_create_mailing_updates_title_when_different():
+ """get_or_create_mailing updates title when existing has different title."""
+ get_or_create_mailing("2025-02", "Old title")
+ m, _ = get_or_create_mailing("2025-02", "New title")
+ m.refresh_from_db()
+ assert m.title == "New title"
+
+
+# --- get_or_create_paper ---
+
+
+@pytest.mark.django_db
+@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile")
+def test_get_or_create_paper_creates_new(mock_profile, db):
+ """get_or_create_paper creates new paper and returns (paper, True)."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, created = get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/p1000r0.pdf",
+ title="A paper",
+ document_date=date(2025, 1, 15),
+ mailing=mailing,
+ subgroup="SG1",
+ author_names=None,
+ year=2025,
+ )
+ assert created is True
+ assert paper.paper_id == "p1000r0"
+ assert paper.title == "A paper"
+ assert paper.year == 2025
+ assert paper.mailing_id == mailing.id
+ assert paper.subgroup == "SG1"
+ mock_profile.assert_not_called()
+
+
+@pytest.mark.django_db
+@patch("wg21_paper_tracker.services.get_or_create_wg21_paper_author_profile")
+@patch("wg21_paper_tracker.services.get_or_create_paper_author")
+def test_get_or_create_paper_calls_author_profile_for_each_author(
+ mock_get_or_create_paper_author, mock_profile, db
+):
+ """get_or_create_paper calls get_or_create_wg21_paper_author_profile and get_or_create_paper_author for each author."""
+ from unittest.mock import MagicMock
+
+ alice_profile = MagicMock()
+ alice_profile.pk = 1
+ bob_profile = MagicMock()
+ bob_profile.pk = 2
+ mock_profile.side_effect = [
+ (alice_profile, True),
+ (bob_profile, True),
+ ]
+ mock_get_or_create_paper_author.return_value = (MagicMock(), True)
+
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, created = get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/p1000r0.pdf",
+ title="A paper",
+ document_date=None,
+ mailing=mailing,
+ author_names=["Alice", "Bob"],
+ year=2025,
+ )
+ assert created is True
+ assert mock_profile.call_count == 2
+ mock_profile.assert_any_call("Alice", email=None)
+ mock_profile.assert_any_call("Bob", email=None)
+ assert mock_get_or_create_paper_author.call_count == 2
+ mock_get_or_create_paper_author.assert_any_call(paper, alice_profile, 1)
+ mock_get_or_create_paper_author.assert_any_call(paper, bob_profile, 2)
+
+
+@pytest.mark.django_db
+def test_get_or_create_paper_normalizes_paper_id_lowercase(db):
+ """get_or_create_paper stores paper_id in lowercase."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, _ = get_or_create_paper(
+ paper_id=" P3039R1 ",
+ url="https://example.com/p3039r1.pdf",
+ title="T",
+ document_date=None,
+ mailing=mailing,
+ year=2025,
+ )
+ assert paper.paper_id == "p3039r1"
+
+
+@pytest.mark.django_db
+def test_get_or_create_paper_gets_existing_and_updates(db):
+ """get_or_create_paper returns existing and updates fields when different."""
+ mailing1, _ = get_or_create_mailing("2025-01", "M1")
+ mailing2, _ = get_or_create_mailing("2025-02", "M2")
+ get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/old.pdf",
+ title="Old title",
+ document_date=date(2025, 1, 1),
+ mailing=mailing1,
+ subgroup="SG1",
+ year=2025,
+ )
+ paper2, created2 = get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/new.pdf",
+ title="New title",
+ document_date=date(2025, 2, 1),
+ mailing=mailing2,
+ subgroup="SG2",
+ year=2025,
+ )
+ assert created2 is False
+ paper2.refresh_from_db()
+ assert paper2.url == "https://example.com/new.pdf"
+ assert paper2.title == "New title"
+ assert paper2.mailing_id == mailing2.id
+ assert paper2.subgroup == "SG2"
+
+
+@pytest.mark.django_db
+def test_get_or_create_paper_year_none_stored_as_zero(db):
+ """get_or_create_paper with year=None stores 0 so records can be updated later."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, _ = get_or_create_paper(
+ paper_id="n5034",
+ url="https://example.com/n5034.html",
+ title="Draft",
+ document_date=None,
+ mailing=mailing,
+ year=None,
+ )
+ assert paper.year == 0
+
+
+@pytest.mark.django_db
+def test_get_or_create_paper_same_paper_id_different_year_creates_two(db):
+ """get_or_create_paper creates separate rows for same paper_id different year (unique_together)."""
+ mailing1, _ = get_or_create_mailing("2024-11", "M1")
+ mailing2, _ = get_or_create_mailing("2025-01", "M2")
+ p1, c1 = get_or_create_paper(
+ paper_id="sd-1",
+ url="https://example.com/sd-1-2024.pdf",
+ title="SD 2024",
+ document_date=None,
+ mailing=mailing1,
+ year=2024,
+ )
+ p2, c2 = get_or_create_paper(
+ paper_id="sd-1",
+ url="https://example.com/sd-1-2025.pdf",
+ title="SD 2025",
+ document_date=None,
+ mailing=mailing2,
+ year=2025,
+ )
+ assert c1 is True and c2 is True
+ assert p1.pk != p2.pk
+ assert p1.year == 2024 and p2.year == 2025
+
+
+@pytest.mark.django_db
+def test_get_or_create_paper_sets_author_order(db):
+ """get_or_create_paper sets author_order (1-based) on WG21PaperAuthor links."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, _ = get_or_create_paper(
+ paper_id="p9999",
+ url="https://example.com/p9999.pdf",
+ title="Multi-author paper",
+ document_date=None,
+ mailing=mailing,
+ author_names=["First Author", "Second Author", "Third Author"],
+ year=2025,
+ )
+ links = list(paper.authors.order_by("author_order"))
+ assert len(links) == 3
+ assert links[0].author_order == 1
+ assert links[1].author_order == 2
+ assert links[2].author_order == 3
+
+
+# --- mark_paper_downloaded ---
+
+
+@pytest.mark.django_db
+def test_mark_paper_downloaded_requires_year(db):
+ """mark_paper_downloaded raises ValueError when year is omitted."""
+ with pytest.raises(ValueError, match="year is required"):
+ mark_paper_downloaded("p1000r0")
+
+
+@pytest.mark.django_db
+def test_mark_paper_downloaded_sets_flag(db):
+ """mark_paper_downloaded sets is_downloaded=True for matching (paper_id, year)."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, _ = get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/p.pdf",
+ title="T",
+ document_date=None,
+ mailing=mailing,
+ year=2025,
+ )
+ assert paper.is_downloaded is False
+ mark_paper_downloaded("p1000r0", year=2025)
+ paper.refresh_from_db()
+ assert paper.is_downloaded is True
+
+
+@pytest.mark.django_db
+def test_mark_paper_downloaded_normalizes_paper_id(db):
+ """mark_paper_downloaded matches case-insensitively (normalizes to lower) and by year."""
+ mailing, _ = get_or_create_mailing("2025-01", "Title")
+ paper, _ = get_or_create_paper(
+ paper_id="p1000r0",
+ url="https://example.com/p.pdf",
+ title="T",
+ document_date=None,
+ mailing=mailing,
+ year=2025,
+ )
+ mark_paper_downloaded(" P1000R0 ", year=2025)
+ paper.refresh_from_db()
+ assert paper.is_downloaded is True
diff --git a/wg21_paper_tracker/tests/test_workspace.py b/wg21_paper_tracker/tests/test_workspace.py
new file mode 100644
index 00000000..09986dff
--- /dev/null
+++ b/wg21_paper_tracker/tests/test_workspace.py
@@ -0,0 +1,86 @@
+"""Tests for wg21_paper_tracker.workspace."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from wg21_paper_tracker.workspace import get_workspace_root, get_raw_dir
+
+
+@pytest.fixture
+def mock_workspace_path(tmp_path):
+ """Patch get_workspace_path to return tmp_path for app slugs."""
+
+ def _get_path(app_slug):
+ p = tmp_path / app_slug.replace("/", "_")
+ p.mkdir(parents=True, exist_ok=True)
+ return p
+
+ with patch(
+ "wg21_paper_tracker.workspace.get_workspace_path",
+ side_effect=_get_path,
+ ):
+ yield tmp_path
+
+
+def test_get_workspace_root_returns_path(mock_workspace_path):
+ """get_workspace_root returns Path for app workspace."""
+ root = get_workspace_root()
+ assert "wg21_paper_tracker" in str(root)
+ assert root.is_dir()
+
+
+def test_get_workspace_root_calls_get_workspace_path_with_slug():
+ """get_workspace_root calls get_workspace_path with app slug."""
+ with patch("wg21_paper_tracker.workspace.get_workspace_path") as m:
+ m.return_value = Path("/fake/workspace/wg21_paper_tracker")
+ root = get_workspace_root()
+ m.assert_called_once_with("wg21_paper_tracker")
+ assert root == Path("/fake/workspace/wg21_paper_tracker")
+
+
+def test_get_raw_dir_returns_mailing_date_subdir(mock_workspace_path):
+ """get_raw_dir returns RAW_DIR/wg21_paper_tracker///."""
+ with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+ mock_settings.RAW_DIR = mock_workspace_path
+ path = get_raw_dir("2025-01", 2025)
+ expected = mock_workspace_path / "wg21_paper_tracker" / "2025" / "2025-01"
+ assert path == expected
+ assert path.is_dir()
+
+
+def test_get_raw_dir_creates_parents(mock_workspace_path):
+ """get_raw_dir creates parent directories."""
+ with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+ mock_settings.RAW_DIR = mock_workspace_path
+ path = get_raw_dir("2026-02", 2026)
+ assert path.exists()
+ assert path.parent.name == "2026"
+ assert path.name == "2026-02"
+
+
+def test_get_raw_dir_idempotent(mock_workspace_path):
+ """get_raw_dir can be called twice for same mailing_date without error."""
+ with patch("wg21_paper_tracker.workspace.settings") as mock_settings:
+ mock_settings.RAW_DIR = mock_workspace_path
+ p1 = get_raw_dir("2025-01", 2025)
+ p2 = get_raw_dir("2025-01", 2025)
+ assert p1 == p2
+ assert p1.parent == p2.parent
+
+
+def test_get_raw_dir_rejects_invalid_mailing_date():
+ """get_raw_dir raises ValueError for non-YYYY-MM mailing_date (path traversal, etc.)."""
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("../../tmp", 2025)
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("2025", 2025)
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("2025-1", 2025)
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("2025-13", 2025)
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("2025-00", 2025)
+ with pytest.raises(ValueError, match="mailing_date must be in YYYY-MM format"):
+ get_raw_dir("", 2025)
diff --git a/wg21_paper_tracker/workspace.py b/wg21_paper_tracker/workspace.py
new file mode 100644
index 00000000..62ec55ef
--- /dev/null
+++ b/wg21_paper_tracker/workspace.py
@@ -0,0 +1,36 @@
+"""
+Workspace paths for wg21_paper_tracker.
+Temporary file storage during download before uploading to GCS.
+"""
+
+import re
+from pathlib import Path
+
+from django.conf import settings
+
+from config.workspace import get_workspace_path
+
+_APP_SLUG = "wg21_paper_tracker"
+_RAW_APP_SLUG = f"raw/{_APP_SLUG}"
+_MAILING_DATE_RE = re.compile(r"^\d{4}-(0[1-9]|1[0-2])$")
+
+
+def get_workspace_root() -> Path:
+ return get_workspace_path(_APP_SLUG)
+
+
+def get_raw_dir(mailing_date: str | None, year: int) -> Path:
+ """Return workspace/raw/wg21_paper_tracker///; creates if missing."""
+ if mailing_date is not None and not _MAILING_DATE_RE.fullmatch(mailing_date):
+ raise ValueError("mailing_date must be in YYYY-MM format")
+ if getattr(settings, "RAW_DIR", None):
+ raw_root = Path(settings.RAW_DIR) / _APP_SLUG
+ else:
+ raw_root = get_workspace_path(_RAW_APP_SLUG)
+ raw_root.mkdir(parents=True, exist_ok=True)
+ if mailing_date:
+ path = raw_root / str(year) / mailing_date
+ else:
+ path = raw_root / str(year)
+ path.mkdir(parents=True, exist_ok=True)
+ return path
diff --git a/workflow/management/commands/run_all_collectors.py b/workflow/management/commands/run_all_collectors.py
index dc531880..a3d3e5e3 100644
--- a/workflow/management/commands/run_all_collectors.py
+++ b/workflow/management/commands/run_all_collectors.py
@@ -22,6 +22,7 @@
"run_boost_mailing_list_tracker",
"run_clang_github_tracker",
"run_discord_exporter",
+ "run_wg21_paper_tracker",
"run_cppa_youtube_script_tracker",
]