From 006adfbaba514e61aa1604193d0b5b05ba27f1dd Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 22:59:28 +0800
Subject: [PATCH 1/6] Improve the automatic paper-research workflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Six focused quality / robustness fixes across the search -> dedup -> filter
-> enrich -> download pipeline.

Search quality
- ranking: add a query-relevance axis. Dedup merges each source's locally
  relevance-sorted list and discarded that ordering, so ranking fell back to
  recency + citation alone — surfacing off-topic-but-highly-cited papers above
  on-topic ones. Now score = relevance (query-term overlap with title >>
  abstract, dominant) + recency + damped log10(citations). keywords is
  optional so single-paper callers keep the old behaviour.
- dedup: normalise the title hash (strip punctuation/whitespace) and drop the
  arXiv version suffix, so "Attention Is All You Need" / "...need." and
  2401.00001v1 / v2 collapse to one record instead of splitting.

Filter correctness
- min_citations was accepted (CLI/MCP/Query) and documented as a filter but
  only ever applied by the semantic_scholar plugin. Enforce it in the pipeline
  for every source; papers whose source reports no count are kept (an unknown
  count is not treated as zero).

Concurrency robustness
- enrich_collection: cap simultaneous per-paper enrichments with a semaphore
  (default 4) so a large collection doesn't fire one Anthropic call + one PDF
  download per paper at once and trip the API rate limit.
- download_pdfs: same semaphore cap, so several sources handing back PDF URLs
  on the same publisher CDN don't burst it into an IP block.

Cleanup
- pipeline._enrich_one: drop the redundant `except (ThesisAgentsError,
  Exception)` (Exception already covers it).

Gates: 588 tests pass (+10), ruff + bandit clean.
---
 tests/test_dedup_ranking.py       | 72 +++++++++++++++++++++++++
 tests/test_pdf_download.py        | 25 +++++++++
 tests/test_pipeline.py            | 54 +++++++++++++++++++
 thesisagents/core/models.py       | 30 +++++++++--
 thesisagents/core/pdf_download.py | 25 +++++++--
 thesisagents/core/pipeline.py     | 46 ++++++++++++++--
 thesisagents/core/ranking.py      | 89 ++++++++++++++++++++++++++-----
 thesisagents/mcp/server.py        |  6 ++-
 8 files changed, 322 insertions(+), 25 deletions(-)

diff --git a/tests/test_dedup_ranking.py b/tests/test_dedup_ranking.py
index 5cddb3e..da7e3d9 100644
--- a/tests/test_dedup_ranking.py
+++ b/tests/test_dedup_ranking.py
@@ -82,6 +82,23 @@ def test_dedupe_merges_multiple_optional_fields():
     assert out[0].year == 2024
 
 
+def test_dedupe_collapses_title_punctuation_and_case():
+    """Same paper from two sources differing only by trailing period / case
+    collapses to one record (no DOI / arXiv ID to key on)."""
+    a = _paper(source_id="1", title="Attention Is All You Need",
+               authors=("Vaswani",), year=2017)
+    b = _paper(source_id="2", title="Attention is all you need.",
+               authors=("Vaswani",), year=2017)
+    assert len(dedupe([a, b])) == 1
+
+
+def test_dedupe_collapses_arxiv_versions():
+    """Two arXiv versions of the same paper (v1, v2) are one paper."""
+    v1 = _paper(source_id="1", arxiv_id="2401.00001v1")
+    v2 = _paper(source_id="2", arxiv_id="2401.00001v2")
+    assert len(dedupe([v1, v2])) == 1
+
+
 def test_rank_prefers_recent_papers():
     old = _paper(source_id="old", year=2010)
     new = _paper(source_id="new", year=2024)
@@ -101,3 +118,58 @@ def test_rank_handles_missing_year():
     b = _paper(source_id="b", year=2024)
     ordered = rank([a, b], current_year=2025)
     assert ordered[0].source_id == "b"
+
+
+def test_rank_relevance_outranks_off_topic_high_citation():
+    """An on-topic paper beats an off-topic, far-more-cited one — relevance is
+    the dominant signal (the failure this whole feature prevents)."""
+    on_topic = _paper(
+        source_id="on", title="A Survey of Transformer Attention",
+        year=2024, citation_count=10,
+    )
+    off_topic = _paper(
+        source_id="off", title="Deep Residual Learning for Image Recognition",
+        year=2024, citation_count=100_000,
+    )
+    ordered = rank(
+        [off_topic, on_topic], keywords="transformer attention", current_year=2025
+    )
+    assert ordered[0].source_id == "on"
+
+
+def test_rank_title_match_outranks_abstract_match():
+    """A query term in the title weighs more than the same term in the abstract."""
+    title_match = _paper(
+        source_id="title", title="Transformer Attention Models",
+        abstract="", year=2024,
+    )
+    abstract_match = _paper(
+        source_id="abs", title="A New Method",
+        abstract="we study transformer attention", year=2024,
+    )
+    ordered = rank(
+        [abstract_match, title_match],
+        keywords="transformer attention", current_year=2025,
+    )
+    assert ordered[0].source_id == "title"
+
+
+def test_rank_without_keywords_ignores_relevance():
+    """No keywords -> relevance axis off; back-compat recency+citation only."""
+    on_topic_but_old = _paper(
+        source_id="on", title="transformer attention", year=2010, citation_count=1,
+    )
+    recent_unrelated = _paper(
+        source_id="recent", title="unrelated topic", year=2024, citation_count=1,
+    )
+    ordered = rank([on_topic_but_old, recent_unrelated], current_year=2025)
+    assert ordered[0].source_id == "recent"  # recency wins; title text ignored
+
+
+def test_rank_short_query_terms_are_dropped():
+    """Stop-word-ish terms (<3 chars) don't create spurious matches."""
+    # "a"/"of" must not match; only "llm" (3 chars) counts.
+    hit = _paper(source_id="hit", title="On LLM agents", year=2024)
+    miss = _paper(source_id="miss", title="A study of birds", year=2024)
+    ordered = rank([miss, hit], keywords="a llm of", current_year=2025)
+    assert ordered[0].source_id == "hit"
diff --git a/tests/test_pdf_download.py b/tests/test_pdf_download.py
index 191db99..a42949f 100644
--- a/tests/test_pdf_download.py
+++ b/tests/test_pdf_download.py
@@ -340,3 +340,28 @@ async def test_cookies_only_attach_to_matching_host(
     cookie_header = transport.request_headers[0].get("cookie", "")
     assert "SECRET" not in cookie_header
     assert "do-not-leak" not in cookie_header
+
+
+async def test_download_pdfs_caps_concurrency(tmp_path: Path, monkeypatch):
+    """A semaphore bounds simultaneous downloads so a big collection doesn't
+    hammer a single publisher CDN with one request per paper at once."""
+    import asyncio
+
+    active = 0
+    peak = 0
+
+    async def fake_download_one(paper, pdf_dir):
+        nonlocal active, peak
+        active += 1
+        peak = max(peak, active)
+        await asyncio.sleep(0.01)
+        active -= 1
+        return pdf_download_module.PdfDownloadResult(
+            paper_key=paper.source_id, path=None, skipped_reason=None
+        )
+
+    monkeypatch.setattr(pdf_download_module, "_download_one", fake_download_one)
+    papers = [_paper(source_id=str(i), pdf_url="https://e/p.pdf") for i in range(8)]
+    results = await download_pdfs(_collection(*papers), tmp_path, concurrency=3)
+    assert len(results) == 8
+    assert 1 <= peak <= 3  # never more than the cap in flight at once
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 0f7a376..f5404a8 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -268,3 +268,57 @@ async def test_run_search_top_tier_only_default_false(monkeypatch):
     collection = await pipeline_module.run_search(query)
     assert len(collection.papers) == 1
     assert collection.papers[0].title == "Low paper"
+
+
+async def test_run_search_min_citations_filters_all_sources(monkeypatch):
+    """min_citations is enforced in the pipeline for every source (not only
+    semantic_scholar). Papers with an unknown (None) citation count are kept."""
+    def _cited(source_id, count):
+        return Paper(
+            source="arxiv", source_id=source_id, title=source_id, authors=(),
+            year=2024, venue=None, abstract="",
+            url=f"https://example.com/{source_id}", citation_count=count,
+        )
+
+    high = _cited("high", 100)
+    low = _cited("low", 5)
+    unknown = _cited("unknown", None)
+    pool = {"arxiv": _make_fetcher("arxiv", [high, low, unknown])}
+    monkeypatch.setattr(pipeline_module, "load_fetcher", lambda name: pool[name])
+    query = Query(
+        keywords="x", sources=("arxiv",), max_results=10, min_citations=50
+    )
+    collection = await pipeline_module.run_search(query, resolve_oa=False)
+    ids = {p.source_id for p in collection.papers}
+    assert "high" in ids          # 100 >= 50
+    assert "unknown" in ids        # unknown count kept, not dropped
+    assert "low" not in ids        # 5 < 50 filtered out
+
+
+async def test_enrich_collection_caps_concurrency(monkeypatch):
+    """A semaphore bounds how many per-paper enrichments run at once, so a big
+    collection doesn't fire one Anthropic call + one PDF download per paper."""
+    import asyncio
+
+    from thesisagents.core.models import PaperCollection
+
+    active = 0
+    peak = 0
+
+    async def fake_enrich_one(paper, *, language, model):
+        nonlocal active, peak
+        active += 1
+        peak = max(peak, active)
+        await asyncio.sleep(0.01)
+        active -= 1
+        return paper
+
+    monkeypatch.setattr(pipeline_module, "_enrich_one", fake_enrich_one)
+    papers = tuple(_paper("arxiv", str(i), f"t{i}") for i in range(8))
+    collection = PaperCollection(
+        query=Query(keywords="x", sources=("arxiv",), max_results=8),
+        papers=papers,
+    )
+    out = await pipeline_module.enrich_collection(collection, concurrency=3)
+    assert len(out.papers) == 8
+    assert 1 <= peak <= 3  # never more than the cap in flight at once
diff --git a/thesisagents/core/models.py b/thesisagents/core/models.py
index 913d61f..067b76f 100644
--- a/thesisagents/core/models.py
+++ b/thesisagents/core/models.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import hashlib
+import re
 from dataclasses import dataclass, field, replace
 from typing import Any
 
@@ -12,6 +13,19 @@
     MAX_RESULTS_PER_SOURCE,
 )
 
+_TITLE_NOISE_RE = re.compile(r"[^a-z0-9]+")
+
+
+def _canon_title(title: str) -> str:
+    """Canonical title form for dedup hashing: lowercase with every
+    non-alphanumeric run (punctuation, whitespace) stripped.
+
+    ``"Attention Is All You Need"``, ``"Attention is all you need."`` and
+    ``"Attention—Is All You Need"`` all map to ``"attentionisallyouneed"``,
+    so cross-source punctuation/spacing noise collapses into one dedup key.
+    """
+    return _TITLE_NOISE_RE.sub("", title.lower())
+
 
 @dataclass(frozen=True, slots=True)
 class RqResult:
@@ -149,13 +163,23 @@ def dedup_key(self) -> str:
         Why: Google Scholar and Semantic Scholar may return the same paper with
         different source-side IDs. DOI is best; arXiv ID next; otherwise hash
         title + first-author + year.
+
+        Each key component is *normalised* so cosmetic cross-source differences
+        don't split one paper into two records:
+
+        * arXiv IDs drop the version suffix — ``2401.00001v1`` and
+          ``2401.00001v2`` are the same paper (a revision), so they collapse.
+        * Title hashes strip punctuation and whitespace via
+          :func:`_canon_title`, so ``"Attention Is All You Need"`` and
+          ``"Attention is all you need."`` produce the same key.
         """
         if self.doi:
             return f"doi:{self.doi.lower()}"
         if self.arxiv_id:
-            return f"arxiv:{self.arxiv_id.lower()}"
-        first_author = self.authors[0].lower() if self.authors else ""
-        seed = f"{self.title.strip().lower()}|{first_author}|{self.year or ''}"
+            arxiv = re.sub(r"v\d+$", "", self.arxiv_id.strip().lower())
+            return f"arxiv:{arxiv}"
+        first_author = self.authors[0].strip().lower() if self.authors else ""
+        seed = f"{_canon_title(self.title)}|{first_author}|{self.year or ''}"
         digest = hashlib.sha256(seed.encode("utf-8"), usedforsecurity=False).hexdigest()
         return f"hash:{digest[:16]}"
 
diff --git a/thesisagents/core/pdf_download.py b/thesisagents/core/pdf_download.py
index e08532c..87f0c9e 100644
--- a/thesisagents/core/pdf_download.py
+++ b/thesisagents/core/pdf_download.py
@@ -33,6 +33,13 @@
 _MAX_PDF_BYTES: int = 50_000_000
 _MAX_FILENAME_LENGTH: int = 80
 
+#: Cap on simultaneously-downloading PDFs. Each per-source client already has
+#: its own token bucket, but several sources can hand back PDF URLs on the SAME
+#: publisher CDN (dl.acm.org, link.springer.com, …) under different source
+#: clients — bypassing any single bucket. A global cap keeps that burst from
+#: looking like a scrape and risking an IP block. 4 keeps the stage brisk.
+_DOWNLOAD_CONCURRENCY: int = 4
+
 #: Real browser User-Agent — many publisher CDNs (Elsevier, IEEE Xplore,
 #: Springer) return 403 to non-browser UAs even for open-access PDFs.
 _BROWSER_UA = (
@@ -52,17 +59,29 @@ class PdfDownloadResult:
 
 
 async def download_pdfs(
-    collection: PaperCollection, out_dir: str | Path
+    collection: PaperCollection,
+    out_dir: str | Path,
+    *,
+    concurrency: int = _DOWNLOAD_CONCURRENCY,
 ) -> list[PdfDownloadResult]:
     """Download every paper's PDF into ``{out_dir}/pdfs/``.
 
     Returns a result per paper so callers can summarise failures. Papers
-    without a ``pdf_url`` are skipped with reason ``no_pdf_url``.
+    without a ``pdf_url`` are skipped with reason ``no_pdf_url``. A semaphore
+    caps how many downloads run at once (``concurrency``, default
+    ``_DOWNLOAD_CONCURRENCY``) so a large collection doesn't hammer a single
+    publisher CDN with one request per paper simultaneously.
     """
     root = ensure_export_dir(out_dir)
     pdf_dir = ensure_export_dir(root / _PDF_SUBDIR)
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _bounded(paper: Paper) -> PdfDownloadResult:
+        async with sem:
+            return await _download_one(paper, pdf_dir)
+
     results = await asyncio.gather(
-        *(_download_one(paper, pdf_dir) for paper in collection.papers)
+        *(_bounded(paper) for paper in collection.papers)
     )
     return list(results)
 
diff --git a/thesisagents/core/pipeline.py b/thesisagents/core/pipeline.py
index 48821f0..7fa8c68 100644
--- a/thesisagents/core/pipeline.py
+++ b/thesisagents/core/pipeline.py
@@ -31,6 +31,12 @@
 
 _LOG = get_logger(__name__)
 
+# Cap on simultaneous per-paper enrichment tasks. Each task fetches a PDF over
+# HTTPS and then calls the Anthropic API; without a cap a 25-paper search would
+# fire 25 concurrent API calls (an easy 429 from Anthropic) plus 25 concurrent
+# PDF downloads. 4 keeps throughput high while staying under typical API limits.
+_ENRICH_CONCURRENCY = 4
+
 
 async def run_search(
     query: Query, *, resolve_oa: bool = True
@@ -60,13 +66,30 @@ async def run_search(
     for source_papers in results:
         flat.extend(source_papers)
     unique = dedupe(flat)
-    ordered = rank(unique)
+    ordered = rank(unique, keywords=query.keywords)
     if query.top_tier_only:
         before = len(ordered)
         ordered = [paper for paper in ordered if is_top_tier(paper)]
         _LOG.info(
             "top-tier filter kept %d / %d papers", len(ordered), before
         )
+    if query.min_citations is not None:
+        # Apply min_citations across EVERY source here, not just the one source
+        # (semantic_scholar) that supports it as an API parameter. Papers whose
+        # source doesn't report a citation count (citation_count is None — e.g.
+        # dblp / doaj / hal / arxiv) are KEPT: an unknown count must not be
+        # treated as zero and silently dropped.
+        before = len(ordered)
+        ordered = [
+            paper
+            for paper in ordered
+            if paper.citation_count is None
+            or paper.citation_count >= query.min_citations
+        ]
+        _LOG.info(
+            "min-citations(>=%d) filter kept %d / %d papers (unknown counts kept)",
+            query.min_citations, len(ordered), before,
+        )
     collection = PaperCollection(
         query=query, papers=tuple(ordered[: query.max_results])
     )
@@ -140,16 +163,29 @@ async def enrich_collection(
     *,
     language: str = "en",
     model: str | None = None,
+    concurrency: int = _ENRICH_CONCURRENCY,
 ) -> PaperCollection:
     """Download each paper's PDF, summarise it with an LLM, attach the result.
 
     Papers without a ``pdf_url`` or whose PDF can't be fetched / parsed pass
     through unchanged — the exporter then falls back to the abstract-based
-    deck. Each enrichment runs in its own asyncio task so total wall-clock
-    is roughly ``max(per-paper time)`` rather than the sum.
+    deck. Enrichments run concurrently but a semaphore caps how many run at
+    once (``concurrency``, default ``_ENRICH_CONCURRENCY``) so a large
+    collection doesn't fire one Anthropic API call + one PDF download per paper
+    all at once and trip the API's rate limit. Total wall-clock is roughly
+    ``ceil(n / concurrency) * per-paper time``.
+
+    Example: ``enrich_collection(coll, concurrency=2)`` processes at most two
+    papers in flight at a time.
     """
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _bounded(paper: Paper) -> Paper:
+        async with sem:
+            return await _enrich_one(paper, language=language, model=model)
+
     enriched_papers = await asyncio.gather(
-        *(_enrich_one(paper, language=language, model=model) for paper in collection.papers)
+        *(_bounded(paper) for paper in collection.papers)
     )
     return PaperCollection(query=collection.query, papers=tuple(enriched_papers))
 
@@ -179,7 +215,7 @@ async def _enrich_one(
         summary = await asyncio.to_thread(
             summarise_paper, paper, pdf, language=language, model=model
         )
-    except (ThesisAgentsError, Exception) as err:  # noqa: BLE001  # API client raises various types
+    except Exception as err:  # noqa: BLE001  # anthropic client raises various, incl. ThesisAgentsError
         _LOG.warning("summarisation failed for %s: %s", paper.bibtex_key(), err)
         return paper
     if summary.is_empty():
diff --git a/thesisagents/core/ranking.py b/thesisagents/core/ranking.py
index 79fd130..32a4dc3 100644
--- a/thesisagents/core/ranking.py
+++ b/thesisagents/core/ranking.py
@@ -1,34 +1,99 @@
-"""Simple relevance + recency + citation ranking.
+"""Relevance + recency + citation ranking.
 
-The signal is intentionally lightweight — sources already return roughly
-relevance-sorted lists. We just stable-merge them and tie-break with recency
-and citation count.
+Each source returns a roughly relevance-sorted list, but de-duplication merges
+those lists across sources and discards each source's local ordering. Without an
+explicit relevance signal the merged list would sort purely by recency +
+citation — which buries a highly-relevant older paper under a recent, more-cited,
+*off-topic* one. (Concrete failure this prevents: a search for "transformer
+attention" that surfaces a 100k-citation ResNet paper above an on-topic survey,
+simply because ResNet is cited more.)
+
+So each paper is scored on three axes and stable-sorted by the sum:
+
+* **relevance** (dominant) — overlap between the query keywords and the paper's
+  title (weighted heavily) + abstract (weighted lightly). Research starts from
+  "is this on my topic?", so an on-topic paper should beat an off-topic one even
+  when the off-topic one is older-and-more-cited.
+* **recency** — exponential decay over paper age (~5-year scale).
+* **citation** — ``log10`` of the citation count (diminishing returns), then
+  damped by ``_CITATION_WEIGHT`` so a huge citation count is a strong tie-break
+  but cannot by itself outrank an on-topic title.
+
+``keywords`` is optional: callers that rank a single fetched paper (no query, e.g.
+``run_single_paper``) pass ``None`` and keep the pre-relevance recency+citation
+behaviour.
 """
 
 from __future__ import annotations
 
 import math
+import re
 from collections.abc import Iterable
 
 from thesisagents.core.models import Paper
 
 _CURRENT_YEAR_FALLBACK = 2026
 
+# Relevance weights. Title overlap dominates; abstract overlap is a softer
+# secondary signal. Tuned together with _CITATION_WEIGHT so that a full
+# title match (= _TITLE_WEIGHT) outranks even a ~100k-citation off-topic paper
+# (citation term ≈ _CITATION_WEIGHT * log10(1e5) = 0.4 * 5 = 2.0 < 3.0).
+_TITLE_WEIGHT = 3.0
+_ABSTRACT_WEIGHT = 0.6
+# Damping on the log10 citation term. Keeps "most-cited" a meaningful tie-break
+# among similarly-relevant papers without letting it swamp the topic signal.
+_CITATION_WEIGHT = 0.4
+# Query/title tokens shorter than this are dropped as stop-word-ish noise
+# ("a", "of", "is", "the"). 3 keeps useful short terms like "llm", "rag", "gan".
+_MIN_TERM_LEN = 3
+
+_WORD_RE = re.compile(r"[a-z0-9]+")
 
-def rank(papers: Iterable[Paper], current_year: int | None = None) -> list[Paper]:
-    """Stable sort by descending composite score."""
+
+def rank(
+    papers: Iterable[Paper],
+    keywords: str | None = None,
+    current_year: int | None = None,
+) -> list[Paper]:
+    """Stable sort by descending composite score (relevance + recency + citation).
+
+    ``keywords`` is the raw query string; when given, papers whose title /
+    abstract share terms with it rank higher. ``None`` disables the relevance
+    axis (single-paper / query-less callers).
+    """
     year_base = current_year or _CURRENT_YEAR_FALLBACK
+    terms = _terms(keywords) if keywords else frozenset()
     return sorted(
         papers,
-        key=lambda paper: _score(paper, year_base),
+        key=lambda paper: _score(paper, year_base, terms),
         reverse=True,
     )
 
 
-def _score(paper: Paper, current_year: int) -> float:
-    recency = _recency_score(paper.year, current_year)
-    citation = _citation_score(paper.citation_count)
-    return recency + citation
+def _score(paper: Paper, current_year: int, terms: frozenset[str]) -> float:
+    return (
+        _relevance_score(paper, terms)
+        + _recency_score(paper.year, current_year)
+        + _citation_score(paper.citation_count)
+    )
+
+
+def _terms(text: str) -> frozenset[str]:
+    """Lowercase alphanumeric tokens of length >= ``_MIN_TERM_LEN``."""
+    return frozenset(w for w in _WORD_RE.findall(text.lower()) if len(w) >= _MIN_TERM_LEN)
+
+
+def _relevance_score(paper: Paper, terms: frozenset[str]) -> float:
+    """Fraction of query terms appearing in title / abstract, title-weighted.
+
+    Range ``[0, _TITLE_WEIGHT + _ABSTRACT_WEIGHT]``. ``0.0`` when no query terms
+    were supplied, so the score reduces to recency + citation.
+    """
+    if not terms:
+        return 0.0
+    title_hit = len(terms & _terms(paper.title)) / len(terms)
+    abstract_hit = len(terms & _terms(paper.abstract)) / len(terms)
+    return title_hit * _TITLE_WEIGHT + abstract_hit * _ABSTRACT_WEIGHT
 
 
 def _recency_score(year: int | None, current_year: int) -> float:
@@ -41,4 +106,4 @@ def _recency_score(year: int | None, current_year: int) -> float:
 def _citation_score(citations: int | None) -> float:
     if citations is None or citations <= 0:
         return 0.0
-    return math.log10(citations + 1.0)
+    return _CITATION_WEIGHT * math.log10(citations + 1.0)
diff --git a/thesisagents/mcp/server.py b/thesisagents/mcp/server.py
index 3c4f340..d86465f 100644
--- a/thesisagents/mcp/server.py
+++ b/thesisagents/mcp/server.py
@@ -274,8 +274,10 @@ async def search(
         conferences + Nature / Science / PNAS / CACM / LNCS). arXiv
         preprints always pass through. Pass ``False`` for a broader net.
 
-        ``min_citations`` filters out papers below the threshold; pass
-        ``None`` for no minimum.
+        ``min_citations`` filters out papers whose citation count is below
+        the threshold — enforced across every source in the pipeline, with
+        papers whose source reports no count kept (not treated as zero).
+        Pass ``None`` for no minimum.
 
         Returns a JSON-serialisable dict with a `papers` list. Each paper has
         the same fields as Paper.to_dict() — pass the list straight to `export`.

From aa3390b1a4a5500a81c2e2d09487d49b8a689c36 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 23:12:12 +0800
Subject: [PATCH 2/6] Further paper-research workflow improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continuation of the workflow-quality pass.

- dedup: canonicalise the first-author surname so name-format differences
  ("Vaswani, Ashish" / "Ashish Vaswani" / "A. Vaswani") don't split one
  DOI-less paper into duplicates — complements the title/arXiv-version canon.
- pipeline: enforce the year range for ALL sources after dedup/rank, not only
  inside the source plugins. Scrape sources (scholar / ieee) filter loosely,
  so a pipeline guard keeps the result within [year_from, year_to]; papers
  with an unknown year are kept.
- cli: expose --min-citations. The flag and the Query wiring were both missing,
  so the filter (now pipeline-wide) was unreachable from the CLI. docs/cli.md
  updated.

Gates: 592 tests pass (+4), ruff + bandit clean.
---
 docs/cli.md                   |  3 ++-
 tests/test_cli.py             | 22 ++++++++++++++++++++++
 tests/test_dedup_ranking.py   | 20 ++++++++++++++++++++
 tests/test_pipeline.py        | 24 ++++++++++++++++++++++++
 thesisagents/cli.py           | 11 +++++++++++
 thesisagents/core/models.py   | 24 +++++++++++++++++++++++-
 thesisagents/core/pipeline.py | 32 ++++++++++++++++++++++++++++++++
 7 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/docs/cli.md b/docs/cli.md
index 0b05874..f06c014 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -15,7 +15,7 @@ exclusive modes:
 thesisagents (--query KEYWORDS | --paper IDENTIFIER)
                 [--source SOURCES] [--exclude-source SOURCES]
                 [--max N]
-                [--year-from YEAR] [--year-to YEAR]
+                [--year-from YEAR] [--year-to YEAR] [--min-citations N]
                 [--export FORMATS]
                 [--out DIR]
                 [--filename-stem STEM]
@@ -39,6 +39,7 @@ thesisagents (--query KEYWORDS | --paper IDENTIFIER)
 | `--exclude-source` / `-x` | — | Comma-separated sources to remove from the mix, subtracted **after** `--source` resolves. The no-VPN gesture is to leave `--source` at its default and pass `--exclude-source ieee` — every other default source stays in. An unknown name (or excluding the whole mix) is an error. |
 | `--max` / `-n` | `25` | Range 1..200. |
 | `--year-from`, `--year-to` | — | Inclusive year filter. |
+| `--min-citations` | — | Drop papers below this citation count, enforced across **all** sources (not just Semantic Scholar). Papers whose source reports no count are kept. Omit for no minimum. |
 | `--export` / `-e` | mode-specific | Any of `pptx`, `xlsx`, `md`, `bib`, `json`, `ris`, `csv`, `csl`. **Default with `--query` is `pptx,xlsx,bib`; default with `--paper` is `pptx,bib`** (one-row Excel is busy work). Explicit `--export` always wins. `ris` (Zotero/Mendeley/EndNote), `csv` (flat table) and `csl` (CSL-JSON for Pandoc, written as `<stem>.csl.json`) are the interchange formats. |
 | `--list-sources`, `--list-exports` | — | Print the available search sources / export formats and exit (no query needed). |
 | `--out` / `-o` | `./exports` | Created if missing. |
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b8c2ed1..a423f4a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -320,6 +320,28 @@ async def fake_shutdown() -> None:  # NOSONAR async stub
     assert captured["query"].sources == expected
 
 
+def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers):
+    """--min-citations is parsed and passed through to the Query (it was
+    previously unreachable from the CLI)."""
+    captured: dict[str, Query] = {}
+
+    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
+        captured["query"] = query
+        return PaperCollection(query=query, papers=tuple(sample_papers))
+
+    async def fake_shutdown() -> None:
+        return None
+
+    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
+    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
+    code = cli_module.main(
+        ["--query", "x", "--out", str(tmp_path), "--export", "bib",
+         "--min-citations", "50"]
+    )
+    assert code == 0
+    assert captured["query"].min_citations == 50
+
+
 def test_cli_exclude_unknown_source_errors(tmp_path):
     """A typo in --exclude-source must fail loudly, not silently no-op."""
     with pytest.raises(SystemExit):
diff --git a/tests/test_dedup_ranking.py b/tests/test_dedup_ranking.py
index da7e3d9..2a761dc 100644
--- a/tests/test_dedup_ranking.py
+++ b/tests/test_dedup_ranking.py
@@ -99,6 +99,26 @@ def test_dedupe_collapses_arxiv_versions():
     assert len(dedupe([v1, v2])) == 1
 
 
+def test_dedupe_collapses_author_name_formats():
+    """Same DOI-less paper whose first author arrives in three different name
+    formats across sources collapses to one record (surname canonicalisation)."""
+    a = _paper(source_id="1", title="On Method X",
+               authors=("Ashish Vaswani",), year=2020)
+    b = _paper(source_id="2", title="On Method X",
+               authors=("Vaswani, Ashish",), year=2020)
+    c = _paper(source_id="3", title="On Method X",
+               authors=("A. Vaswani",), year=2020)
+    assert len(dedupe([a, b, c])) == 1
+
+
+def test_dedupe_distinguishes_different_surnames():
+    """Different first-author surnames (same title/year) stay distinct — the
+    surname canonicalisation must not over-merge."""
+    a = _paper(source_id="1", title="Common Title", authors=("Alice Smith",), year=2020)
+    b = _paper(source_id="2", title="Common Title", authors=("Bob Jones",), year=2020)
+    assert len(dedupe([a, b])) == 2
+
+
 def test_rank_prefers_recent_papers():
     old = _paper(source_id="old", year=2010)
     new = _paper(source_id="new", year=2024)
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index f5404a8..304063b 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -295,6 +295,30 @@ def _cited(source_id, count):
     assert "low" not in ids        # 5 < 50 filtered out
 
 
+async def test_run_search_year_range_pipeline_guard(monkeypatch):
+    """The pipeline enforces the year range even when a source returns
+    out-of-range papers (scrape sources filter loosely); unknown years kept."""
+    def _yr(sid, year):
+        return Paper(
+            source="arxiv", source_id=sid, title=sid, authors=(), year=year,
+            venue=None, abstract="", url=f"https://example.com/{sid}",
+        )
+
+    in_range = _yr("in", 2022)
+    too_old = _yr("old", 2010)
+    too_new = _yr("new", 2025)
+    unknown = _yr("unk", None)
+    pool = {"arxiv": _make_fetcher("arxiv", [in_range, too_old, too_new, unknown])}
+    monkeypatch.setattr(pipeline_module, "load_fetcher", lambda name: pool[name])
+    query = Query(
+        keywords="x", sources=("arxiv",), max_results=10,
+        year_from=2020, year_to=2024,
+    )
+    collection = await pipeline_module.run_search(query, resolve_oa=False)
+    ids = {p.source_id for p in collection.papers}
+    assert ids == {"in", "unk"}  # in-range + unknown kept; old/new dropped
+
+
 async def test_enrich_collection_caps_concurrency(monkeypatch):
     """A semaphore bounds how many per-paper enrichments run at once, so a big
     collection doesn't fire one Anthropic call + one PDF download per paper."""
diff --git a/thesisagents/cli.py b/thesisagents/cli.py
index 99161d3..67797d9 100644
--- a/thesisagents/cli.py
+++ b/thesisagents/cli.py
@@ -184,6 +184,16 @@ def build_parser() -> argparse.ArgumentParser:
         default=None,
         help="Latest publication year (inclusive).",
     )
+    parser.add_argument(
+        "--min-citations",
+        type=int,
+        default=None,
+        help=(
+            "Drop papers with fewer than this many citations (enforced across "
+            "all sources). Papers whose source reports no citation count are "
+            "kept. Omit for no minimum."
+        ),
+    )
     parser.add_argument(
         "--export",
         "-e",
@@ -622,6 +632,7 @@ async def _collect(args: argparse.Namespace):
         year_from=args.year_from,
         year_to=args.year_to,
         top_tier_only=args.top_tier_only,
+        min_citations=args.min_citations,
     )
     _LOG.info("Running search: %s across %s", keywords, ", ".join(sources))
     return await run_search(query, resolve_oa=args.resolve_oa)
diff --git a/thesisagents/core/models.py b/thesisagents/core/models.py
index 067b76f..9d08309 100644
--- a/thesisagents/core/models.py
+++ b/thesisagents/core/models.py
@@ -27,6 +27,25 @@ def _canon_title(title: str) -> str:
     return _TITLE_NOISE_RE.sub("", title.lower())
 
 
+def _canon_author(name: str) -> str:
+    """Canonical surname token of an author name for dedup hashing.
+
+    Sources format the first author differently — ``"Vaswani, Ashish"``
+    (PubMed), ``"Ashish Vaswani"`` (arXiv), ``"A. Vaswani"`` (Crossref). Reduce
+    each to its surname so a DOI-less paper isn't split into duplicates by name
+    formatting: a comma form keeps the part before the comma, otherwise the last
+    whitespace-separated token. All three examples above collapse to
+    ``"vaswani"``.
+    """
+    name = name.strip().lower()
+    if not name:
+        return ""
+    if "," in name:
+        return name.split(",", 1)[0].strip()
+    parts = name.split()
+    return parts[-1] if parts else ""
+
+
 @dataclass(frozen=True, slots=True)
 class RqResult:
     """One research-question evaluation block: question + result table + bullets.
@@ -172,13 +191,16 @@ def dedup_key(self) -> str:
         * Title hashes strip punctuation and whitespace via
           :func:`_canon_title`, so ``"Attention Is All You Need"`` and
           ``"Attention is all you need."`` produce the same key.
+        * The first author is reduced to its surname via
+          :func:`_canon_author`, so ``"Vaswani, Ashish"`` / ``"Ashish
+          Vaswani"`` / ``"A. Vaswani"`` don't split one paper into duplicates.
         """
         if self.doi:
             return f"doi:{self.doi.lower()}"
         if self.arxiv_id:
             arxiv = re.sub(r"v\d+$", "", self.arxiv_id.strip().lower())
             return f"arxiv:{arxiv}"
-        first_author = self.authors[0].strip().lower() if self.authors else ""
+        first_author = _canon_author(self.authors[0]) if self.authors else ""
         seed = f"{_canon_title(self.title)}|{first_author}|{self.year or ''}"
         digest = hashlib.sha256(seed.encode("utf-8"), usedforsecurity=False).hexdigest()
         return f"hash:{digest[:16]}"
diff --git a/thesisagents/core/pipeline.py b/thesisagents/core/pipeline.py
index 7fa8c68..5fe8d7e 100644
--- a/thesisagents/core/pipeline.py
+++ b/thesisagents/core/pipeline.py
@@ -90,6 +90,21 @@ async def run_search(
             "min-citations(>=%d) filter kept %d / %d papers (unknown counts kept)",
             query.min_citations, len(ordered), before,
         )
+    if query.year_from is not None or query.year_to is not None:
+        # Pipeline-level year guard. Most source plugins already filter by year,
+        # but scrape sources (scholar / ieee) do it loosely, so enforce the
+        # range once here for ALL sources. Papers with an unknown year are KEPT
+        # (uncertainty must not silently drop a possibly-in-range paper).
+        before = len(ordered)
+        ordered = [
+            paper
+            for paper in ordered
+            if _in_year_range(paper.year, query.year_from, query.year_to)
+        ]
+        _LOG.info(
+            "year filter [%s..%s] kept %d / %d papers (unknown years kept)",
+            query.year_from or "", query.year_to or "", len(ordered), before,
+        )
     collection = PaperCollection(
         query=query, papers=tuple(ordered[: query.max_results])
     )
@@ -140,6 +155,23 @@ async def _safe_search(fetcher, query: Query) -> list[Paper]:
     return []
 
 
+def _in_year_range(
+    year: int | None, year_from: int | None, year_to: int | None
+) -> bool:
+    """True if ``year`` is within ``[year_from, year_to]`` (None bound = open).
+
+    Unlike the per-source ``in_year_range`` helpers (which drop year-less
+    records), this pipeline guard KEEPS a paper whose year is unknown — at this
+    stage the source already chose to return it, so an unknown year is treated
+    as "possibly in range" rather than silently filtered out.
+    """
+    if year is None:
+        return True
+    if year_from is not None and year < year_from:
+        return False
+    return not (year_to is not None and year > year_to)
+
+
 async def run_single_paper(identifier: PaperIdentifier) -> PaperCollection:
     """Fetch exactly one paper by its identifier and wrap it in a collection.
 

From 6ba99b2ef28b339cba2e0a33652301fc2ba34d35 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 23:17:37 +0800
Subject: [PATCH 3/6] Capture workflow-pass conventions in subagent rules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ranking/dedup/filter/concurrency workflow pass introduced two conventions
worth pinning so they don't bit-rot:

- code-quality-reviewer (Async & Concurrency): add the bounded-fan-out rule —
  a stage that gathers one op over N papers (enrich / download / OA resolve)
  escapes the per-source token buckets and must use a module-level Semaphore
  cap. Includes the 25-paper 429 / IP-block rationale + anti-pattern + pattern.

- compliance-auditor (Core vs Source Plugins): add the query-semantics rule —
  Query filter fields (min_citations / year / top_tier) are enforced in the
  pipeline for all sources, never delegated to plugins. Documents the real
  min_citations incident (only semantic_scholar honoured it) + anti-pattern.
---
 .claude/agents/rules/code-quality-reviewer.md | 1 +
 .claude/agents/rules/compliance-auditor.md    | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.claude/agents/rules/code-quality-reviewer.md b/.claude/agents/rules/code-quality-reviewer.md
index 2416438..9cc0774 100644
--- a/.claude/agents/rules/code-quality-reviewer.md
+++ b/.claude/agents/rules/code-quality-reviewer.md
@@ -51,6 +51,7 @@ You do NOT modify files. The parent agent decides whether to fix.
 - Per-source rate limits live in `thesisagents/fetchers/rate_limit.py` as a token-bucket decorator. Each source plugin declares its own bucket (`arxiv: 1 req/3s`, `semantic_scholar: 1 req/s`, `scholar: 1 req/10s with jitter`, etc.). Do NOT bypass the bucket — even retries go through it.
 - Streamlit runs the UI on a separate thread per session. Mutate `st.session_state` only, never module globals. Long-running export jobs are dispatched to the FastAPI backend and polled.
 - All fixture-recording, CLI exports, and tests use `asyncio.run` at the outermost layer and never inside library code.
+- **Bounded fan-out over a paper collection.** A per-source token bucket caps traffic *within* one source, but a stage that fans one operation out over N papers (PDF download, OA resolve, LLM enrich) escapes those buckets — several papers can hit the *same* external API (Anthropic) or publisher CDN (`dl.acm.org`) at once through different source clients. Any `asyncio.gather(*(coro(item) for item in collection.papers))` MUST wrap each task in a module-level `asyncio.Semaphore` cap (the project uses 4–5: `pipeline._ENRICH_CONCURRENCY`, `pdf_download._DOWNLOAD_CONCURRENCY`, `oa_resolver._CONCURRENCY`). **Why:** an unbounded fan-out of a 25-paper search fires 25 concurrent Anthropic calls (an easy 429) or 25 hits on one CDN (an IP block) — it looks fine at 3 papers and fails at 30. **Anti-pattern:** `await asyncio.gather(*(_download_one(p, d) for p in collection.papers))` with no semaphore. **Pattern:** define `sem = asyncio.Semaphore(max(1, concurrency))` and a `_bounded` wrapper that does `async with sem: return await coro(item)`.
 
 ## Security (review-level)
 
diff --git a/.claude/agents/rules/compliance-auditor.md b/.claude/agents/rules/compliance-auditor.md
index 1e5c03c..360e035 100644
--- a/.claude/agents/rules/compliance-auditor.md
+++ b/.claude/agents/rules/compliance-auditor.md
@@ -46,6 +46,8 @@ The line between `thesisagents/` and `sources/<name>/` is **not** "anything sour
 
 **Loading source plugins:** at runtime `thesisagents/fetchers/base.py::load_fetcher(name)` does `importlib.import_module(f"thesisagents.sources.{name}")`. No more `sys.path` injection — sources are a regular sub-package of `thesisagents`. Plugins under `thesisagents/sources/<name>/` use **relative imports** internally (`from .fetcher import …`, `from .parser import …`, `from . import webrunner_backend`). Tests import them via the full path (`from thesisagents.sources.<name>.fetcher import …`) and reference module-string targets the same way (e.g. `monkeypatch.setattr("thesisagents.sources.acm.fetcher.get_client", …)`).
 
+**Query semantics are enforced in core, never delegated to plugins.** Every filter field on `Query` (`min_citations`, `year_from` / `year_to`, `top_tier_only`) MUST be applied by the pipeline (`thesisagents/core/pipeline.py::run_search`, after dedup + rank) so it holds for **all** sources uniformly. A plugin MAY *additionally* push the filter to its upstream API as an optimisation (e.g. `semantic_scholar` sends `minCitationCount`), but core must never *rely* on plugins to do it. **Why (real incident):** `min_citations` was accepted by CLI/MCP/`Query`, documented as a threshold filter, and implemented by only the `semantic_scholar` plugin — so for the other 14 sources it silently did nothing; `min_citations=50` still returned 2-citation arXiv / OpenAlex / DBLP papers. **Anti-pattern:** wiring a `Query` filter into one source's request params and assuming the result set is filtered. **Pattern:** filter in `run_search`, and keep records whose value is *unknown* (`citation_count is None` / `year is None`) rather than treating unknown as failing the threshold (a source not reporting a count must not silently drop the paper).
+
 **When in doubt:** "if a user installs ThesisAgents with the default `requirements.txt` and never enables a source plugin, should this source work?" Yes → core. No → source plugin.
 
 ---

From b6df24fb6493f131dcd82666b0f7a4509d8498cd Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 23:28:31 +0800
Subject: [PATCH 4/6] Resolve SonarCloud PR-17 findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- S2583 (test_pdf_download / test_pipeline concurrency tests): the
  'condition always false' on 'assert 1 <= peak <= 3' was a static-analysis
  false positive — Sonar can't see that 'peak' (a nonlocal int) is mutated by
  the monkeypatched async double via download_pdfs/enrich_collection's gather.
  Switch the counter to a dict so the mutation isn't const-propagated to 0;
  same fix applied to the enrich test pre-emptively.
- S7503 (test_cli min-citations test): the fake_run_search / fake_shutdown
  doubles replace awaited production fns and must stay async — mark NOSONAR.
---
 tests/test_cli.py          |  4 ++--
 tests/test_pdf_download.py | 14 +++++++-------
 tests/test_pipeline.py     | 14 +++++++-------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index a423f4a..0c7aefb 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -325,11 +325,11 @@ def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers
     previously unreachable from the CLI)."""
     captured: dict[str, Query] = {}
 
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
+    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:  # NOSONAR async stub
         captured["query"] = query
         return PaperCollection(query=query, papers=tuple(sample_papers))
 
-    async def fake_shutdown() -> None:
+    async def fake_shutdown() -> None:  # NOSONAR async stub
         return None
 
     monkeypatch.setattr(cli_module, "run_search", fake_run_search)
diff --git a/tests/test_pdf_download.py b/tests/test_pdf_download.py
index a42949f..19ca761 100644
--- a/tests/test_pdf_download.py
+++ b/tests/test_pdf_download.py
@@ -347,15 +347,15 @@ async def test_download_pdfs_caps_concurrency(tmp_path: Path, monkeypatch):
     hammer a single publisher CDN with one request per paper at once."""
     import asyncio
 
-    active = 0
-    peak = 0
+    # dict (not nonlocal ints) so the counter mutation is visible to static
+    # analysers that don't trace the monkeypatched async call path.
+    counters = {"active": 0, "peak": 0}
 
     async def fake_download_one(paper, pdf_dir):
-        nonlocal active, peak
-        active += 1
-        peak = max(peak, active)
+        counters["active"] += 1
+        counters["peak"] = max(counters["peak"], counters["active"])
         await asyncio.sleep(0.01)
-        active -= 1
+        counters["active"] -= 1
         return pdf_download_module.PdfDownloadResult(
             paper_key=paper.source_id, path=None, skipped_reason=None
         )
@@ -364,4 +364,4 @@ async def fake_download_one(paper, pdf_dir):
     papers = [_paper(source_id=str(i), pdf_url="https://e/p.pdf") for i in range(8)]
     results = await download_pdfs(_collection(*papers), tmp_path, concurrency=3)
     assert len(results) == 8
-    assert 1 <= peak <= 3  # never more than the cap in flight at once
+    assert 1 <= counters["peak"] <= 3  # never more than the cap in flight at once
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 304063b..ecbd562 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -326,15 +326,15 @@ async def test_enrich_collection_caps_concurrency(monkeypatch):
 
     from thesisagents.core.models import PaperCollection
 
-    active = 0
-    peak = 0
+    # dict (not nonlocal ints) so the counter mutation is visible to static
+    # analysers that don't trace the monkeypatched async call path.
+    counters = {"active": 0, "peak": 0}
 
     async def fake_enrich_one(paper, *, language, model):
-        nonlocal active, peak
-        active += 1
-        peak = max(peak, active)
+        counters["active"] += 1
+        counters["peak"] = max(counters["peak"], counters["active"])
         await asyncio.sleep(0.01)
-        active -= 1
+        counters["active"] -= 1
         return paper
 
     monkeypatch.setattr(pipeline_module, "_enrich_one", fake_enrich_one)
@@ -345,4 +345,4 @@ async def fake_enrich_one(paper, *, language, model):
     )
     out = await pipeline_module.enrich_collection(collection, concurrency=3)
     assert len(out.papers) == 8
-    assert 1 <= peak <= 3  # never more than the cap in flight at once
+    assert 1 <= counters["peak"] <= 3  # never more than the cap in flight at once

From 98b34d4d4e1b88e7669bed00b1c73ade2d538c86 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 23:41:30 +0800
Subject: [PATCH 5/6] Reduce test_cli.py duplication via shared capture fixture

The per-test fake_run_search / fake_shutdown / monkeypatch boilerplate was the
single biggest duplication source (118 dup lines, ~13% file density). Upgrade
the existing patched_pipeline fixture to also capture the Query, and route the
four query-asserting tests (source-default / exclude-source / min-citations /
top-tier) through it instead of each re-rolling its own fake. -35 lines; the
two async-stub NOSONARs now live once in the fixture.
---
 tests/test_cli.py | 85 ++++++++++++++---------------------------------
 1 file changed, 25 insertions(+), 60 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0c7aefb..f1ab12e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -43,16 +43,28 @@ async def _fake_success(collection, out_dir):
 
 @pytest.fixture()
 def patched_pipeline(monkeypatch, sample_papers):
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
+    """Stub cli's run_search + shutdown_clients and return a dict that captures
+    the Query passed to run_search (``patched_pipeline["query"]``).
+
+    Tests that only need the pipeline stubbed ignore the return value; tests
+    that assert on the constructed Query read ``patched_pipeline["query"]``
+    instead of each re-rolling their own ``fake_run_search`` / ``fake_shutdown``
+    boilerplate.
+    """
+    captured: dict[str, Query] = {}
+
+    async def fake_run_search(query, **_kwargs):  # NOSONAR async stub
+        captured["query"] = query
         return PaperCollection(query=query, papers=tuple(sample_papers))
 
-    async def fake_shutdown() -> None:
+    async def fake_shutdown():  # NOSONAR async stub
         return None
 
     monkeypatch.setattr(cli_module, "run_search", fake_run_search)
     monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
     # Leave the autouse ``_stub_download_pdfs`` in place so the new per-paper
     # PPT gate sees every paper as downloadable.
+    return captured
 
 
 def test_cli_runs_end_to_end(tmp_path: Path, patched_pipeline, capsys):
@@ -248,28 +260,16 @@ def test_cli_rejects_doi_identifier_until_resolver_lands(tmp_path):
     assert code == 2
 
 
-def test_cli_source_default_is_multi_source(tmp_path, monkeypatch, sample_papers):
+def test_cli_source_default_is_multi_source(tmp_path, patched_pipeline):
     """When --source is omitted, run_search must be invoked across the
     DEFAULT_SOURCES mix, not just arxiv."""
     from thesisagents.core.constants import DEFAULT_SOURCES
 
-    captured: dict[str, Query] = {}
-
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
-        captured["query"] = query
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:
-        return None
-
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
-
     code = cli_module.main(
         ["--query", "x", "--out", str(tmp_path), "--export", "bib"]
     )
     assert code == 0
-    assert captured["query"].sources == DEFAULT_SOURCES
+    assert patched_pipeline["query"].sources == DEFAULT_SOURCES
 
 
 def test_cli_list_sources(capsys):
@@ -293,53 +293,30 @@ def test_cli_list_exports(capsys):
     assert "pptx" in out
 
 
-def test_cli_exclude_source_prunes_default_mix(tmp_path, monkeypatch, sample_papers):
+def test_cli_exclude_source_prunes_default_mix(tmp_path, patched_pipeline):
     """--exclude-source subtracts from the resolved mix; the no-VPN path drops
     only ieee and keeps every other default source."""
     from thesisagents.core.constants import DEFAULT_SOURCES
 
-    captured: dict[str, Query] = {}
-
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:  # NOSONAR async stub
-        captured["query"] = query
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:  # NOSONAR async stub
-        return None
-
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
-
     code = cli_module.main(
         ["--query", "x", "--out", str(tmp_path), "--export", "bib",
          "--exclude-source", "ieee"]
     )
     assert code == 0
-    assert "ieee" not in captured["query"].sources
+    assert "ieee" not in patched_pipeline["query"].sources
     expected = tuple(s for s in DEFAULT_SOURCES if s != "ieee")
-    assert captured["query"].sources == expected
+    assert patched_pipeline["query"].sources == expected
 
 
-def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers):
+def test_cli_min_citations_flows_into_query(tmp_path, patched_pipeline):
     """--min-citations is parsed and passed through to the Query (it was
     previously unreachable from the CLI)."""
-    captured: dict[str, Query] = {}
-
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:  # NOSONAR async stub
-        captured["query"] = query
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:  # NOSONAR async stub
-        return None
-
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
     code = cli_module.main(
         ["--query", "x", "--out", str(tmp_path), "--export", "bib",
          "--min-citations", "50"]
     )
     assert code == 0
-    assert captured["query"].min_citations == 50
+    assert patched_pipeline["query"].min_citations == 50
 
 
 def test_cli_exclude_unknown_source_errors(tmp_path):
@@ -359,33 +336,21 @@ def test_cli_exclude_all_sources_errors(tmp_path):
         )
 
 
-def test_cli_top_tier_filter_off_by_default(tmp_path, monkeypatch, sample_papers):
+def test_cli_top_tier_filter_off_by_default(tmp_path, patched_pipeline):
     """top_tier_only is OFF by default (broader coverage including IEEE / ACM
     workshops); --top-tier-only flips it on."""
-    captured: dict[str, Query] = {}
-
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
-        captured["query"] = query
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:
-        return None
-
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
-
     code = cli_module.main(
         ["--query", "x", "--out", str(tmp_path), "--export", "bib"]
     )
     assert code == 0
-    assert captured["query"].top_tier_only is False
+    assert patched_pipeline["query"].top_tier_only is False
 
-    captured.clear()
+    patched_pipeline.clear()
     code = cli_module.main(
         ["--query", "x", "--top-tier-only", "--out", str(tmp_path), "--export", "bib"]
     )
     assert code == 0
-    assert captured["query"].top_tier_only is True
+    assert patched_pipeline["query"].top_tier_only is True
 
 
 def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, sample_papers):

From f44b40ba9511cde18d75de9d03fe9d8bdc5abbc8 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 5 Jun 2026 23:47:45 +0800
Subject: [PATCH 6/6] Reduce test_cli.py duplication further

Route the pdf-download and auto-enrich tests through the shared
patched_pipeline fixture, and delete the now-redundant
_fake_search_with_papers helper (it duplicated the fixture's run_search /
shutdown stub). test_cli.py 926 -> 848 lines across this pass; the only
remaining fake_run_search definitions are the fixture and _patch_search
(which legitimately takes custom papers with controlled pdf_url).
---
 tests/test_cli.py | 65 ++++++++---------------------------------------
 1 file changed, 11 insertions(+), 54 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index f1ab12e..b0233ed 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -353,61 +353,33 @@ def test_cli_top_tier_filter_off_by_default(tmp_path, patched_pipeline):
     assert patched_pipeline["query"].top_tier_only is True
 
 
-def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, sample_papers):
+def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, patched_pipeline):
     """Default flag set should invoke download_pdfs; --no-pdf disables it."""
     calls: list[str] = []
 
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:
-        return None
-
-    async def fake_download(_collection, _out_dir):
+    async def fake_download(_collection, _out_dir):  # NOSONAR async stub
         calls.append("called")
         return []
 
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
     monkeypatch.setattr(cli_module, "download_pdfs", fake_download)
-
     code = cli_module.main(
-        [
-            "--query", "x",
-            "--source", "arxiv",
-            "--out", str(tmp_path),
-            "--export", "bib",
-        ]
+        ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"]
     )
     assert code == 0
     assert calls == ["called"]
 
 
-def test_cli_no_pdf_flag_skips_download(tmp_path, monkeypatch, sample_papers):
+def test_cli_no_pdf_flag_skips_download(tmp_path, monkeypatch, patched_pipeline):
     calls: list[str] = []
 
-    async def fake_run_search(query: Query, **_kwargs) -> PaperCollection:
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown() -> None:
-        return None
-
-    async def fake_download(_collection, _out_dir):
+    async def fake_download(_collection, _out_dir):  # NOSONAR async stub
         calls.append("called")
         return []
 
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
     monkeypatch.setattr(cli_module, "download_pdfs", fake_download)
-
     code = cli_module.main(
-        [
-            "--query", "x",
-            "--source", "arxiv",
-            "--no-pdf",
-            "--out", str(tmp_path),
-            "--export", "bib",
-        ]
+        ["--query", "x", "--source", "arxiv", "--no-pdf",
+         "--out", str(tmp_path), "--export", "bib"]
     )
     assert code == 0
     assert calls == []
@@ -734,21 +706,9 @@ async def fake_enrich(collection, language=None, model=None):  # noqa: ARG001
     return calls
 
 
-def _fake_search_with_papers(monkeypatch, sample_papers):
-    async def fake_run_search(query, **_kwargs):
-        return PaperCollection(query=query, papers=tuple(sample_papers))
-
-    async def fake_shutdown():
-        return None
-
-    monkeypatch.setattr(cli_module, "run_search", fake_run_search)
-    monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown)
-
-
-def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, sample_papers):
+def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, patched_pipeline):
     """ANTHROPIC_API_KEY in env + no --lightweight = auto-enrich fires."""
     monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
-    _fake_search_with_papers(monkeypatch, sample_papers)
     calls = _stub_enrich_collection(monkeypatch)
     code = cli_module.main(
         ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"]
@@ -757,10 +717,9 @@ def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, sample_papers
     assert calls == ["called"]
 
 
-def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, sample_papers):
+def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, patched_pipeline):
     """--lightweight wins over the auto-enrich default."""
     monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
-    _fake_search_with_papers(monkeypatch, sample_papers)
     calls = _stub_enrich_collection(monkeypatch)
     code = cli_module.main(
         [
@@ -773,10 +732,9 @@ def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, sample_papers)
     assert calls == []
 
 
-def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, sample_papers):
+def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, patched_pipeline):
     """No ANTHROPIC_API_KEY → no Anthropic call, lightweight deck."""
     monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
-    _fake_search_with_papers(monkeypatch, sample_papers)
     calls = _stub_enrich_collection(monkeypatch)
     code = cli_module.main(
         ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"]
@@ -785,11 +743,10 @@ def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, sample_papers):
     assert calls == []
 
 
-def test_cli_explicit_enrich_still_works(tmp_path, monkeypatch, sample_papers):
+def test_cli_explicit_enrich_still_works(tmp_path, monkeypatch, patched_pipeline):
     """--enrich runs even without a key in env (the explicit path used to
     error inside the API client; here we only check the CLI dispatch)."""
     monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key")
-    _fake_search_with_papers(monkeypatch, sample_papers)
     calls = _stub_enrich_collection(monkeypatch)
     code = cli_module.main(
         [