Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e3c1a65
Initial wg21-paper-tracker added
leostar0412 Mar 9, 2026
9892a45
wg21_paper_tracker: features, tests, and cleanup #24
leostar0412 Mar 10, 2026
18f07c3
Fix lint/format error #24
leostar0412 Mar 10, 2026
f4388ff
Validate mailing_date in get_raw_dir; WG21 author order/resolution an…
leostar0412 Mar 10, 2026
62d5d42
Fix: WG21 tracker (year, GCS guard, IntegrityError), author_alias, Pi…
leostar0412 Mar 10, 2026
e3e91c8
Fix: WG21 – optional Cloud Run, per-blob isolation, PDF priority, yea…
leostar0412 Mar 10, 2026
2159f53
wg21: fix author_alias migration default, fail job when bucket unset,…
leostar0412 Mar 11, 2026
be392a0
wg21: honor settings.RAW_DIR for raw paper storage #24
leostar0412 Mar 11, 2026
005278a
Fix: lint/format error #24
leostar0412 Mar 11, 2026
3547652
fix(openai_converter): use neutral page placeholder for failed pages #24
leostar0412 Mar 11, 2026
7403033
Fix: doc and converter fixes #24
leostar0412 Mar 11, 2026
c33c475
Fix: default sqlite, document #24
leostar0412 Mar 11, 2026
93ee8b7
Fix: author profile merge avoidance, blank paper_id rejection, mailin…
leostar0412 Mar 11, 2026
61a6c7f
Fix: author profile merge avoidance, blank paper_id rejection, pipeli…
leostar0412 Mar 11, 2026
c246241
Fix: OpenRouter retries, CSV year from parsed date, placeholder race …
leostar0412 Mar 11, 2026
c79c1dd
Merge branch 'develop' into dev-24
leostar0412 Mar 18, 2026
3e32ee2
refactor(wg21): pipeline dispatch + mailing range; remove Cloud Run s…
leostar0412 Mar 21, 2026
637b0e8
Merge remote-tracking branch 'origin/dev-24' into dev-24
leostar0412 Mar 21, 2026
818dcaf
Remove migration #24
leostar0412 Mar 21, 2026
4eb5c9b
wg21 paper updates, WG21 profile test fix, revert separate test DB UR…
leostar0412 Mar 27, 2026
22d740e
Merge branch 'develop' into dev-24
snowfox1003 Apr 20, 2026
57d9334
Fix: lint/format error
leostar0412 Apr 20, 2026
e122eb7
Fix: compose error
leostar0412 Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"clang_github_tracker",
"cppa_slack_tracker",
"discord_activity_tracker",
"wg21_paper_tracker",
"cppa_youtube_script_tracker",
"slack_event_handler",
]
Expand Down Expand Up @@ -160,6 +161,7 @@
"cppa_slack_tracker",
"discord_activity_tracker",
"boost_mailing_list_tracker",
"wg21_paper_tracker",
"cppa_youtube_script_tracker",
"shared",
)
Expand Down Expand Up @@ -451,6 +453,16 @@ def _slack_team_scope_from_env():
)
).resolve()

# WG21 Paper Tracker Configuration
# Optional GitHub `repository_dispatch` hand-off for newly tracked papers;
# see docs/operations/WG21_GitHub_Dispatch.md for the payload contract.
# Every string value is normalised with `(value or "").strip()` so that a None
# or whitespace-only environment value becomes an empty string.

# Master switch: dispatch is off unless explicitly enabled.
WG21_GITHUB_DISPATCH_ENABLED = env.bool("WG21_GITHUB_DISPATCH_ENABLED", default=False)
# Target repository in "owner/repo" form (the repo whose workflow will run).
WG21_GITHUB_DISPATCH_REPO = (env("WG21_GITHUB_DISPATCH_REPO", default="") or "").strip()
# Token allowed to create repository dispatch events on the target repo.
# Secret: supply it through the environment, never commit it.
WG21_GITHUB_DISPATCH_TOKEN = (
env("WG21_GITHUB_DISPATCH_TOKEN", default="") or ""
).strip()
# Event type sent with the dispatch; it must match `on.repository_dispatch.types`
# in the target workflow. The trailing `or` restores the default when the
# variable is set but blank, so the event type is never an empty string.
WG21_GITHUB_DISPATCH_EVENT_TYPE = (
env("WG21_GITHUB_DISPATCH_EVENT_TYPE", default="wg21_papers_convert") or ""
).strip() or "wg21_papers_convert"

# Logging - project-wide configuration for app commands (console + rotating file)
LOG_DIR = Path(env("LOG_DIR", default=str(BASE_DIR / "logs")))
LOG_FILE = env("LOG_FILE", default="app.log")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 4.2.28

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the indexed ``author_alias`` column to ``WG21PaperAuthorProfile``."""

    dependencies = [
        ("cppa_user_tracker", "0004_alter_slackuser_slack_user_id_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="wg21paperauthorprofile",
            name="author_alias",
            field=models.CharField(
                max_length=255,
                blank=True,
                db_index=True,
                default="",
            ),
            # The empty-string default is only used to fill existing rows while
            # the column is added; it is not kept on the field afterwards.
            preserve_default=False,
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Merge parallel branches from 0004: WG21 author_alias vs YouTube speaker chain.

from django.db import migrations


class Migration(migrations.Migration):
    """Merge the two migration branches of ``cppa_user_tracker``.

    Joins the WG21 ``author_alias`` migration with the YouTube speaker
    ``external_id`` migration. It performs no schema changes of its own.
    """

    dependencies = [
        ("cppa_user_tracker", "0005_wg21paperauthorprofile_author_alias"),
        ("cppa_user_tracker", "0007_youtubespeaker_external_id"),
    ]

    # Intentionally empty: a merge migration only reconciles the graph.
    operations = []
1 change: 1 addition & 0 deletions cppa_user_tracker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def save(self, *args, **kwargs):
super().save(*args, **kwargs)

display_name = models.CharField(max_length=255, db_index=True, blank=True)
author_alias = models.CharField(max_length=255, blank=True, db_index=True)
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)

Expand Down
37 changes: 36 additions & 1 deletion cppa_user_tracker/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
MailingListProfile,
SlackUser,
DiscordProfile,
WG21PaperAuthorProfile,
YoutubeSpeaker,
)

Expand Down Expand Up @@ -248,7 +249,9 @@ def _get_next_negative_github_account_id() -> int:


@transaction.atomic
def get_or_create_slack_user(user_data: dict[str, Any]) -> tuple[SlackUser, bool]:
def get_or_create_slack_user(
user_data: dict[str, Any],
) -> tuple[SlackUser, bool]:
"""Get or create a SlackUser from Slack API user data. Returns (SlackUser, created).

If the user exists, updates username, display_name, and avatar_url from user_data.
Expand Down Expand Up @@ -353,6 +356,38 @@ def get_or_create_discord_profile(
return profile, created


def get_or_create_wg21_paper_author_profile(
    display_name: str,
    email: Optional[str] = None,
) -> tuple[WG21PaperAuthorProfile, bool]:
    """Get or create a WG21PaperAuthorProfile by display_name and optional email.

    Both arguments are stripped of surrounding whitespace; a blank email is
    treated as "no email". Profiles whose display_name matches exactly are
    examined in ascending id order:

    - Email given: return the first candidate that already has that email.
    - No email given: return the first candidate that has no emails at all.
    - Otherwise (no candidates, or none satisfies the rule above): create a new
      profile and, when an email was given, attach it as the primary email.

    An existing profile is never modified, so the same name with a different
    (or missing) email does not get merged into someone else's profile.

    Args:
        display_name: Author name to look up; None or blank is stored as "".
        email: Optional email used to disambiguate authors sharing a name.

    Returns:
        A ``(profile, created)`` tuple; ``created`` is True only when a new
        profile row was inserted by this call.
    """
    display_name_val = (display_name or "").strip()
    email_val = (email or "").strip() or None

    candidates = WG21PaperAuthorProfile.objects.filter(
        display_name=display_name_val
    ).order_by("id")

    for candidate in candidates:
        if email_val:
            # Same name + same email: this is the same author.
            if candidate.emails.filter(email=email_val).exists():
                return candidate, False
        elif not candidate.emails.exists():
            # No email to go on: only reuse a profile that has no emails either.
            return candidate, False

    # Create the profile and its email together. Without the transaction, a
    # failure in add_email would leave an email-less profile behind, which
    # later email-less lookups for this name would then match.
    with transaction.atomic():
        profile = WG21PaperAuthorProfile.objects.create(display_name=display_name_val)
        if email_val:
            add_email(profile, email_val, is_primary=True)
    return profile, True


def get_or_create_youtube_speaker(
external_id: str,
display_name: str = "",
Expand Down
101 changes: 101 additions & 0 deletions cppa_user_tracker/tests/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
GitHubAccountType,
Identity,
TempProfileIdentityRelation,
WG21PaperAuthorProfile,
)
from cppa_user_tracker import services

Expand Down Expand Up @@ -569,3 +570,103 @@ def test_get_or_create_mailing_list_profile_strips_display_name_and_email():
assert created is True
assert profile.display_name == "Trimmed"
assert profile.emails.filter(email="trimmed@example.com").exists()


# --- get_or_create_wg21_paper_author_profile ---


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_no_candidates_creates():
    """With no profile stored for the name, the service creates exactly one."""
    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="New Author"
    )

    assert was_created is True
    assert resolved.display_name == "New Author"
    stored = WG21PaperAuthorProfile.objects.filter(display_name="New Author")
    assert stored.count() == 1


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_no_candidates_with_email_adds_email():
    """A newly created profile carries the email that was passed in."""
    outcome = services.get_or_create_wg21_paper_author_profile(
        display_name="Author With Email",
        email="author@example.com",
    )
    resolved, was_created = outcome

    assert was_created is True
    assert resolved.emails.filter(email="author@example.com").exists()


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_one_candidate_returns_it():
    """A single email-less profile with the same name is reused when no email is passed."""
    solo = WG21PaperAuthorProfile.objects.create(display_name="Solo Author")

    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="Solo Author"
    )

    assert was_created is False
    assert resolved.id == solo.id


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_one_candidate_with_new_email_creates_new_profile():
    """A name match whose profile lacks the given email yields a brand-new profile.

    An existing row is only reused when it already has the email, or when no
    email is passed and the row has no emails. Here neither holds, so the
    service creates a second profile and leaves the first one untouched.
    """
    original = WG21PaperAuthorProfile.objects.create(display_name="Solo Author")

    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="Solo Author",
        email="solo@example.com",
    )

    assert was_created is True
    assert resolved.id != original.id
    assert resolved.display_name == "Solo Author"
    assert resolved.emails.filter(email="solo@example.com").exists()
    same_name = WG21PaperAuthorProfile.objects.filter(display_name="Solo Author")
    assert same_name.count() == 2
    assert not original.emails.filter(email="solo@example.com").exists()


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_two_candidates_no_email_returns_first():
    """With two email-less profiles sharing a name and no email given, the earlier one wins."""
    earliest = WG21PaperAuthorProfile.objects.create(display_name="Dup Name")
    # The second row exists only to make the name ambiguous.
    WG21PaperAuthorProfile.objects.create(display_name="Dup Name")

    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="Dup Name"
    )

    assert was_created is False
    assert resolved.id == earliest.id


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_second():
    """Among profiles sharing a name, the one that owns the given email is returned."""
    # The first row has no emails and must be passed over.
    WG21PaperAuthorProfile.objects.create(display_name="Same Name")
    email_owner = WG21PaperAuthorProfile.objects.create(display_name="Same Name")
    services.add_email(email_owner, "match@example.com", is_primary=True)

    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="Same Name",
        email="match@example.com",
    )

    assert was_created is False
    assert resolved.id == email_owner.id


@pytest.mark.django_db
def test_get_or_create_wg21_paper_author_profile_two_candidates_email_matches_none_creates_new_profile():
    """If no same-named profile owns the given email, a new profile is created with it."""
    without_email = WG21PaperAuthorProfile.objects.create(display_name="Other Name")
    with_other_email = WG21PaperAuthorProfile.objects.create(display_name="Other Name")
    services.add_email(with_other_email, "other@example.com", is_primary=True)

    resolved, was_created = services.get_or_create_wg21_paper_author_profile(
        display_name="Other Name",
        email="nomatch@example.com",
    )

    assert was_created is True
    existing_ids = (without_email.id, with_other_email.id)
    assert resolved.id not in existing_ids
    assert resolved.display_name == "Other Name"
    assert resolved.emails.filter(email="nomatch@example.com").exists()
25 changes: 22 additions & 3 deletions docs/Schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ erDiagram

WG21PaperAuthorProfile {
string display_name "IX"
string author_alias "IX"
datetime created_at
datetime updated_at
}
Expand Down Expand Up @@ -633,29 +634,47 @@ erDiagram
erDiagram
Direction LR
WG21PaperAuthorProfile ||--o{ WG21PaperAuthor : "author"
WG21Mailing ||--o{ WG21Paper : "has"
WG21PaperAuthor }o--|| WG21Paper : "has"

WG21PaperAuthor {
int id PK
int paper_id FK
int profile_id FK
int author_order
datetime created_at
}

WG21Mailing {
int id PK
string mailing_date UK "IX"
string title
datetime created_at
datetime updated_at
}

WG21Paper {
int id PK
string paper_id UK "IX"
string paper_id "IX"
int year "IX"
string url
string title "IX"
date publication_date "IX"
date document_date "IX"
int mailing_id FK "IX"
string subgroup "IX"
boolean is_downloaded "IX"
Comment thread
leostar0412 marked this conversation as resolved.
datetime created_at
datetime updated_at
}
```

**Note:** **WG21PaperAuthorProfile** extends `BaseProfile` (section 1). `profile_id` in WG21PaperAuthor references this profile; each paper can have multiple authors.

**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor.
**Note:** **WG21Mailing** stores information about the mailing release, identified by `mailing_date` (e.g. "2025-03"). `mailing_id` in WG21Paper references this mailing.

**Note:** **WG21Paper** is uniquely identified by the composite `(paper_id, year)`; `paper_id` is not globally unique. The same paper identifier may appear in different years (e.g. revisions).

**Note:** Composite unique constraint should be applied on (`paper_id`, `profile_id`) in WG21PaperAuthor. `author_order` is optional and 1-based; it indicates the order of authors on the paper.

---

Expand Down
69 changes: 69 additions & 0 deletions docs/operations/WG21_GitHub_Dispatch.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# WG21 Paper Tracker → GitHub Actions (`repository_dispatch`)

The Django app **`run_wg21_paper_tracker`** scrapes WG21 mailings and stores paper metadata in the database. It does **not** download PDFs or other documents. When **new** paper rows are created in a run, it can send **one** [repository dispatch](https://docs.github.com/en/rest/repos/repos#create-a-repository-dispatch-event) to another GitHub repository so a workflow there fetches each URL and runs conversion (e.g. PDF → Markdown).

## Environment variables

| Variable | Required | Description |
|----------|----------|-------------|
| `WG21_GITHUB_DISPATCH_ENABLED` | No (default `false`) | Set to `true` to send `repository_dispatch` when there are new papers. |
| `WG21_GITHUB_DISPATCH_REPO` | Yes, if enabled | Target repo as `owner/repo` (the repo whose workflow will run). |
| `WG21_GITHUB_DISPATCH_TOKEN` | Yes, if enabled | PAT or token with permission to create repository dispatch events on that repo (classic PAT: `repo` scope for private repos). |
| `WG21_GITHUB_DISPATCH_EVENT_TYPE` | No | Must match `on.repository_dispatch.types` in the target workflow. Default: `wg21_papers_convert`. |

## `client_payload` contract

The JSON body includes only a list of URL strings:

```json
{
"papers": [
"https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/…",
"https://www.open-std.org/…"
]
}
```

- **`papers`**: array of strings (WG21 document URLs), all new papers from **that** pipeline run in a **single** event.
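- There is **no** `new_paper_count` field. GitHub Actions expressions have no `length()` function, so count the entries inside a step if needed (for example, pipe `toJson(github.event.client_payload.papers)` through `jq 'length'`).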

## Target repository workflow (example)

```yaml
on:
  repository_dispatch:
    types: [wg21_papers_convert]

jobs:
  convert:
    runs-on: ubuntu-latest
    steps:
      - name: URLs
        # Pass the payload through an environment variable instead of
        # interpolating it into the script text: a URL containing a quote
        # would otherwise break out of the shell string.
        env:
          PAPERS_JSON: ${{ toJson(github.event.client_payload.papers) }}
        run: |
          echo "$PAPERS_JSON"
          # Fetch each URL, convert, store artifacts / upload elsewhere
```

In expressions, `github.event.client_payload.papers` is a JSON array of strings.

## Token security

Store `WG21_GITHUB_DISPATCH_TOKEN` in a secret manager or CI secret—never commit it. Prefer a fine-grained PAT scoped to the conversion repo if possible.

## Payload size

Very large mailings could produce many URLs in one payload. If you approach GitHub or runner limits, document a split strategy (multiple dispatches) as an edge case; the default is one dispatch per tracker run with the full list.

## CLI options

- **`--from-date YYYY-MM`**: Process mailings with `mailing_date >= YYYY-MM` (WG21 / CSV style). Backfills from that key onward when used alone.
- **`--to-date YYYY-MM`**: Upper bound: `mailing_date <= YYYY-MM`. With `--from-date`, the run uses the inclusive range `[from, to]`. Without `--from-date`, behavior stays incremental (only mailings **newer than** the latest `WG21Mailing` in the DB), but capped at `to`—useful to avoid pulling very new mailings in a controlled run.
- **`--dry-run`**: Log only; do not run the pipeline or send dispatch.

## Flow summary

1. Scheduler runs `run_wg21_paper_tracker` (optionally with `--from-date` / `--to-date`).
2. Pipeline fetches mailings, upserts `WG21Mailing` / `WG21Paper` (metadata only).
3. For each row **newly created** in that run, its document URL is collected.
4. If the list is non-empty and dispatch is enabled, the app POSTs once to `POST /repos/{owner}/{repo}/dispatches` with `event_type` and `client_payload: { "papers": [ ... ] }`.
5. The conversion repo’s workflow runs and downloads each URL.
8 changes: 8 additions & 0 deletions docs/service_api/cppa_user_tracker.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@

---

## WG21PaperAuthorProfile

| Function | Parameter types | Return type | Description |
| -------------------------------------- | -------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_or_create_wg21_paper_author_profile` | `display_name: str`, `email: str \| None = None` | `tuple[WG21PaperAuthorProfile, bool]` | Resolve by exact `display_name` (stripped), using the optional `email` for disambiguation. Candidates with that name are checked in ascending `id` order. If `email` is supplied, returns the first candidate that already has that email. If no `email` is supplied, returns the first candidate that has no emails. Otherwise (no candidates, or none satisfies the rule) creates a new profile and, when `email` is supplied, adds it as the primary email. Existing profiles are never modified: a name match with a different email produces a new profile rather than attaching the email to the existing one. Returns the profile and a boolean indicating creation. Use when linking paper authors so that same name + same email link to the same profile. |

---

## DiscordProfile

| Function | Parameter types | Return type | Description |
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ slack-bolt>=1.18
pytz>=2024.1
selenium>=4.35

# wg21_paper_tracker app
beautifulsoup4>=4.12.0
# cppa_youtube_script_tracker app (YouTube Data API v3 + VTT transcript download)
google-api-python-client>=2.100
yt-dlp==2026.2.4
Expand Down
Empty file added wg21_paper_tracker/__init__.py
Empty file.
Loading
Loading