From 006adfbaba514e61aa1604193d0b5b05ba27f1dd Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 22:59:28 +0800 Subject: [PATCH 1/6] Improve the automatic paper-research workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six focused quality / robustness fixes across the search -> dedup -> filter -> enrich -> download pipeline. Search quality - ranking: add a query-relevance axis. Dedup merges each source's locally relevance-sorted list and discarded that ordering, so ranking fell back to recency + citation alone — surfacing off-topic-but-highly-cited papers above on-topic ones. Now score = relevance (query-term overlap with title >> abstract, dominant) + recency + damped log10(citations). keywords is optional so single-paper callers keep the old behaviour. - dedup: normalise the title hash (strip punctuation/whitespace) and drop the arXiv version suffix, so "Attention Is All You Need" / "...need." and 2401.00001v1 / v2 collapse to one record instead of splitting. Filter correctness - min_citations was accepted (CLI/MCP/Query) and documented as a filter but only ever applied by the semantic_scholar plugin. Enforce it in the pipeline for every source; papers whose source reports no count are kept (an unknown count is not treated as zero). Concurrency robustness - enrich_collection: cap simultaneous per-paper enrichments with a semaphore (default 4) so a large collection doesn't fire one Anthropic call + one PDF download per paper at once and trip the API rate limit. - download_pdfs: same semaphore cap, so several sources handing back PDF URLs on the same publisher CDN don't burst it into an IP block. Cleanup - pipeline._enrich_one: drop the redundant `except (ThesisAgentsError, Exception)` (Exception already covers it). Gates: 588 tests pass (+10), ruff + bandit clean. --- tests/test_dedup_ranking.py | 72 +++++++++++++++++++++++++ tests/test_pdf_download.py | 25 +++++++++ tests/test_pipeline.py | 54 +++++++++++++++++++ thesisagents/core/models.py | 30 +++++++++-- thesisagents/core/pdf_download.py | 25 +++++++-- thesisagents/core/pipeline.py | 46 ++++++++++++++-- thesisagents/core/ranking.py | 89 ++++++++++++++++++++++++++----- thesisagents/mcp/server.py | 6 ++- 8 files changed, 322 insertions(+), 25 deletions(-) diff --git a/tests/test_dedup_ranking.py b/tests/test_dedup_ranking.py index 5cddb3e..da7e3d9 100644 --- a/tests/test_dedup_ranking.py +++ b/tests/test_dedup_ranking.py @@ -82,6 +82,23 @@ def test_dedupe_merges_multiple_optional_fields(): assert out[0].year == 2024 +def test_dedupe_collapses_title_punctuation_and_case(): + """Same paper from two sources differing only by trailing period / case + collapses to one record (no DOI / arXiv ID to key on).""" + a = _paper(source_id="1", title="Attention Is All You Need", + authors=("Vaswani",), year=2017) + b = _paper(source_id="2", title="Attention is all you need.", + authors=("Vaswani",), year=2017) + assert len(dedupe([a, b])) == 1 + + +def test_dedupe_collapses_arxiv_versions(): + """Two arXiv versions of the same paper (v1, v2) are one paper.""" + v1 = _paper(source_id="1", arxiv_id="2401.00001v1") + v2 = _paper(source_id="2", arxiv_id="2401.00001v2") + assert len(dedupe([v1, v2])) == 1 + + def test_rank_prefers_recent_papers(): old = _paper(source_id="old", year=2010) new = _paper(source_id="new", year=2024) @@ -101,3 +118,58 @@ def test_rank_handles_missing_year(): b = _paper(source_id="b", year=2024) ordered = rank([a, b], current_year=2025) assert ordered[0].source_id == "b" + + +def test_rank_relevance_outranks_off_topic_high_citation(): + """An on-topic paper beats an off-topic, far-more-cited one — relevance is + the dominant signal (the failure this whole feature prevents).""" + on_topic = _paper( + source_id="on", title="A Survey of Transformer Attention", + year=2024, citation_count=10, + ) + off_topic = _paper( + source_id="off", title="Deep Residual Learning for Image Recognition", + year=2024, citation_count=100_000, + ) + ordered = rank( + [off_topic, on_topic], keywords="transformer attention", current_year=2025 + ) + assert ordered[0].source_id == "on" + + +def test_rank_title_match_outranks_abstract_match(): + """A query term in the title weighs more than the same term in the abstract.""" + title_match = _paper( + source_id="title", title="Transformer Attention Models", + abstract="", year=2024, + ) + abstract_match = _paper( + source_id="abs", title="A New Method", + abstract="we study transformer attention", year=2024, + ) + ordered = rank( + [abstract_match, title_match], + keywords="transformer attention", current_year=2025, + ) + assert ordered[0].source_id == "title" + + +def test_rank_without_keywords_ignores_relevance(): + """No keywords -> relevance axis off; back-compat recency+citation only.""" + on_topic_but_old = _paper( + source_id="on", title="transformer attention", year=2010, citation_count=1, + ) + recent_unrelated = _paper( + source_id="recent", title="unrelated topic", year=2024, citation_count=1, + ) + ordered = rank([on_topic_but_old, recent_unrelated], current_year=2025) + assert ordered[0].source_id == "recent" # recency wins; title text ignored + + +def test_rank_short_query_terms_are_dropped(): + """Stop-word-ish terms (<3 chars) don't create spurious matches.""" + # "a"/"of" must not match; only "llm" (3 chars) counts. + hit = _paper(source_id="hit", title="On LLM agents", year=2024) + miss = _paper(source_id="miss", title="A study of birds", year=2024) + ordered = rank([miss, hit], keywords="a llm of", current_year=2025) + assert ordered[0].source_id == "hit" diff --git a/tests/test_pdf_download.py b/tests/test_pdf_download.py index 191db99..a42949f 100644 --- a/tests/test_pdf_download.py +++ b/tests/test_pdf_download.py @@ -340,3 +340,28 @@ async def test_cookies_only_attach_to_matching_host( cookie_header = transport.request_headers[0].get("cookie", "") assert "SECRET" not in cookie_header assert "do-not-leak" not in cookie_header + + +async def test_download_pdfs_caps_concurrency(tmp_path: Path, monkeypatch): + """A semaphore bounds simultaneous downloads so a big collection doesn't + hammer a single publisher CDN with one request per paper at once.""" + import asyncio + + active = 0 + peak = 0 + + async def fake_download_one(paper, pdf_dir): + nonlocal active, peak + active += 1 + peak = max(peak, active) + await asyncio.sleep(0.01) + active -= 1 + return pdf_download_module.PdfDownloadResult( + paper_key=paper.source_id, path=None, skipped_reason=None + ) + + monkeypatch.setattr(pdf_download_module, "_download_one", fake_download_one) + papers = [_paper(source_id=str(i), pdf_url="https://e/p.pdf") for i in range(8)] + results = await download_pdfs(_collection(*papers), tmp_path, concurrency=3) + assert len(results) == 8 + assert 1 <= peak <= 3 # never more than the cap in flight at once diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 0f7a376..f5404a8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -268,3 +268,57 @@ async def test_run_search_top_tier_only_default_false(monkeypatch): collection = await pipeline_module.run_search(query) assert len(collection.papers) == 1 assert collection.papers[0].title == "Low paper" + + +async def test_run_search_min_citations_filters_all_sources(monkeypatch): + """min_citations is enforced in the pipeline for every source (not only + semantic_scholar). Papers with an unknown (None) citation count are kept.""" + def _cited(source_id, count): + return Paper( + source="arxiv", source_id=source_id, title=source_id, authors=(), + year=2024, venue=None, abstract="", + url=f"https://example.com/{source_id}", citation_count=count, + ) + + high = _cited("high", 100) + low = _cited("low", 5) + unknown = _cited("unknown", None) + pool = {"arxiv": _make_fetcher("arxiv", [high, low, unknown])} + monkeypatch.setattr(pipeline_module, "load_fetcher", lambda name: pool[name]) + query = Query( + keywords="x", sources=("arxiv",), max_results=10, min_citations=50 + ) + collection = await pipeline_module.run_search(query, resolve_oa=False) + ids = {p.source_id for p in collection.papers} + assert "high" in ids # 100 >= 50 + assert "unknown" in ids # unknown count kept, not dropped + assert "low" not in ids # 5 < 50 filtered out + + +async def test_enrich_collection_caps_concurrency(monkeypatch): + """A semaphore bounds how many per-paper enrichments run at once, so a big + collection doesn't fire one Anthropic call + one PDF download per paper.""" + import asyncio + + from thesisagents.core.models import PaperCollection + + active = 0 + peak = 0 + + async def fake_enrich_one(paper, *, language, model): + nonlocal active, peak + active += 1 + peak = max(peak, active) + await asyncio.sleep(0.01) + active -= 1 + return paper + + monkeypatch.setattr(pipeline_module, "_enrich_one", fake_enrich_one) + papers = tuple(_paper("arxiv", str(i), f"t{i}") for i in range(8)) + collection = PaperCollection( + query=Query(keywords="x", sources=("arxiv",), max_results=8), + papers=papers, + ) + out = await pipeline_module.enrich_collection(collection, concurrency=3) + assert len(out.papers) == 8 + assert 1 <= peak <= 3 # never more than the cap in flight at once diff --git a/thesisagents/core/models.py b/thesisagents/core/models.py index 913d61f..067b76f 100644 --- a/thesisagents/core/models.py +++ b/thesisagents/core/models.py @@ -3,6 +3,7 @@ from __future__ import annotations import hashlib +import re from dataclasses import dataclass, field, replace from typing import Any @@ -12,6 +13,19 @@ MAX_RESULTS_PER_SOURCE, ) +_TITLE_NOISE_RE = re.compile(r"[^a-z0-9]+") + + +def _canon_title(title: str) -> str: + """Canonical title form for dedup hashing: lowercase with every + non-alphanumeric run (punctuation, whitespace) stripped. + + ``"Attention Is All You Need"``, ``"Attention is all you need."`` and + ``"Attention—Is All You Need"`` all map to ``"attentionisallyouneed"``, + so cross-source punctuation/spacing noise collapses into one dedup key. + """ + return _TITLE_NOISE_RE.sub("", title.lower()) + @dataclass(frozen=True, slots=True) class RqResult: @@ -149,13 +163,23 @@ def dedup_key(self) -> str: Why: Google Scholar and Semantic Scholar may return the same paper with different source-side IDs. DOI is best; arXiv ID next; otherwise hash title + first-author + year. + + Each key component is *normalised* so cosmetic cross-source differences + don't split one paper into two records: + + * arXiv IDs drop the version suffix — ``2401.00001v1`` and + ``2401.00001v2`` are the same paper (a revision), so they collapse. + * Title hashes strip punctuation and whitespace via + :func:`_canon_title`, so ``"Attention Is All You Need"`` and + ``"Attention is all you need."`` produce the same key. """ if self.doi: return f"doi:{self.doi.lower()}" if self.arxiv_id: - return f"arxiv:{self.arxiv_id.lower()}" - first_author = self.authors[0].lower() if self.authors else "" - seed = f"{self.title.strip().lower()}|{first_author}|{self.year or ''}" + arxiv = re.sub(r"v\d+$", "", self.arxiv_id.strip().lower()) + return f"arxiv:{arxiv}" + first_author = self.authors[0].strip().lower() if self.authors else "" + seed = f"{_canon_title(self.title)}|{first_author}|{self.year or ''}" digest = hashlib.sha256(seed.encode("utf-8"), usedforsecurity=False).hexdigest() return f"hash:{digest[:16]}" diff --git a/thesisagents/core/pdf_download.py b/thesisagents/core/pdf_download.py index e08532c..87f0c9e 100644 --- a/thesisagents/core/pdf_download.py +++ b/thesisagents/core/pdf_download.py @@ -33,6 +33,13 @@ _MAX_PDF_BYTES: int = 50_000_000 _MAX_FILENAME_LENGTH: int = 80 +#: Cap on simultaneously-downloading PDFs. Each per-source client already has +#: its own token bucket, but several sources can hand back PDF URLs on the SAME +#: publisher CDN (dl.acm.org, link.springer.com, …) under different source +#: clients — bypassing any single bucket. A global cap keeps that burst from +#: looking like a scrape and risking an IP block. 4 keeps the stage brisk. +_DOWNLOAD_CONCURRENCY: int = 4 + #: Real browser User-Agent — many publisher CDNs (Elsevier, IEEE Xplore, #: Springer) return 403 to non-browser UAs even for open-access PDFs. _BROWSER_UA = ( @@ -52,17 +59,29 @@ class PdfDownloadResult: async def download_pdfs( - collection: PaperCollection, out_dir: str | Path + collection: PaperCollection, + out_dir: str | Path, + *, + concurrency: int = _DOWNLOAD_CONCURRENCY, ) -> list[PdfDownloadResult]: """Download every paper's PDF into ``{out_dir}/pdfs/``. Returns a result per paper so callers can summarise failures. Papers - without a ``pdf_url`` are skipped with reason ``no_pdf_url``. + without a ``pdf_url`` are skipped with reason ``no_pdf_url``. A semaphore + caps how many downloads run at once (``concurrency``, default + ``_DOWNLOAD_CONCURRENCY``) so a large collection doesn't hammer a single + publisher CDN with one request per paper simultaneously. """ root = ensure_export_dir(out_dir) pdf_dir = ensure_export_dir(root / _PDF_SUBDIR) + sem = asyncio.Semaphore(max(1, concurrency)) + + async def _bounded(paper: Paper) -> PdfDownloadResult: + async with sem: + return await _download_one(paper, pdf_dir) + results = await asyncio.gather( - *(_download_one(paper, pdf_dir) for paper in collection.papers) + *(_bounded(paper) for paper in collection.papers) ) return list(results) diff --git a/thesisagents/core/pipeline.py b/thesisagents/core/pipeline.py index 48821f0..7fa8c68 100644 --- a/thesisagents/core/pipeline.py +++ b/thesisagents/core/pipeline.py @@ -31,6 +31,12 @@ _LOG = get_logger(__name__) +# Cap on simultaneous per-paper enrichment tasks. Each task fetches a PDF over +# HTTPS and then calls the Anthropic API; without a cap a 25-paper search would +# fire 25 concurrent API calls (an easy 429 from Anthropic) plus 25 concurrent +# PDF downloads. 4 keeps throughput high while staying under typical API limits. +_ENRICH_CONCURRENCY = 4 + async def run_search( query: Query, *, resolve_oa: bool = True @@ -60,13 +66,30 @@ async def run_search( for source_papers in results: flat.extend(source_papers) unique = dedupe(flat) - ordered = rank(unique) + ordered = rank(unique, keywords=query.keywords) if query.top_tier_only: before = len(ordered) ordered = [paper for paper in ordered if is_top_tier(paper)] _LOG.info( "top-tier filter kept %d / %d papers", len(ordered), before ) + if query.min_citations is not None: + # Apply min_citations across EVERY source here, not just the one source + # (semantic_scholar) that supports it as an API parameter. Papers whose + # source doesn't report a citation count (citation_count is None — e.g. + # dblp / doaj / hal / arxiv) are KEPT: an unknown count must not be + # treated as zero and silently dropped. + before = len(ordered) + ordered = [ + paper + for paper in ordered + if paper.citation_count is None + or paper.citation_count >= query.min_citations + ] + _LOG.info( + "min-citations(>=%d) filter kept %d / %d papers (unknown counts kept)", + query.min_citations, len(ordered), before, + ) collection = PaperCollection( query=query, papers=tuple(ordered[: query.max_results]) ) @@ -140,16 +163,29 @@ async def enrich_collection( *, language: str = "en", model: str | None = None, + concurrency: int = _ENRICH_CONCURRENCY, ) -> PaperCollection: """Download each paper's PDF, summarise it with an LLM, attach the result. Papers without a ``pdf_url`` or whose PDF can't be fetched / parsed pass through unchanged — the exporter then falls back to the abstract-based - deck. Each enrichment runs in its own asyncio task so total wall-clock - is roughly ``max(per-paper time)`` rather than the sum. + deck. Enrichments run concurrently but a semaphore caps how many run at + once (``concurrency``, default ``_ENRICH_CONCURRENCY``) so a large + collection doesn't fire one Anthropic API call + one PDF download per paper + all at once and trip the API's rate limit. Total wall-clock is roughly + ``ceil(n / concurrency) * per-paper time``. + + Example: ``enrich_collection(coll, concurrency=2)`` processes at most two + papers in flight at a time. """ + sem = asyncio.Semaphore(max(1, concurrency)) + + async def _bounded(paper: Paper) -> Paper: + async with sem: + return await _enrich_one(paper, language=language, model=model) + enriched_papers = await asyncio.gather( - *(_enrich_one(paper, language=language, model=model) for paper in collection.papers) + *(_bounded(paper) for paper in collection.papers) ) return PaperCollection(query=collection.query, papers=tuple(enriched_papers)) @@ -179,7 +215,7 @@ async def _enrich_one( summary = await asyncio.to_thread( summarise_paper, paper, pdf, language=language, model=model ) - except (ThesisAgentsError, Exception) as err: # noqa: BLE001 # API client raises various types + except Exception as err: # noqa: BLE001 # anthropic client raises various, incl. ThesisAgentsError _LOG.warning("summarisation failed for %s: %s", paper.bibtex_key(), err) return paper if summary.is_empty(): diff --git a/thesisagents/core/ranking.py b/thesisagents/core/ranking.py index 79fd130..32a4dc3 100644 --- a/thesisagents/core/ranking.py +++ b/thesisagents/core/ranking.py @@ -1,34 +1,99 @@ -"""Simple relevance + recency + citation ranking. +"""Relevance + recency + citation ranking. -The signal is intentionally lightweight — sources already return roughly -relevance-sorted lists. We just stable-merge them and tie-break with recency -and citation count. +Each source returns a roughly relevance-sorted list, but de-duplication merges +those lists across sources and discards each source's local ordering. Without an +explicit relevance signal the merged list would sort purely by recency + +citation — which buries a highly-relevant older paper under a recent, more-cited, +*off-topic* one. (Concrete failure this prevents: a search for "transformer +attention" that surfaces a 100k-citation ResNet paper above an on-topic survey, +simply because ResNet is cited more.) + +So each paper is scored on three axes and stable-sorted by the sum: + +* **relevance** (dominant) — overlap between the query keywords and the paper's + title (weighted heavily) + abstract (weighted lightly). Research starts from + "is this on my topic?", so an on-topic paper should beat an off-topic one even + when the off-topic one is older-and-more-cited. +* **recency** — exponential decay over paper age (~5-year scale). +* **citation** — ``log10`` of the citation count (diminishing returns), then + damped by ``_CITATION_WEIGHT`` so a huge citation count is a strong tie-break + but cannot by itself outrank an on-topic title. + +``keywords`` is optional: callers that rank a single fetched paper (no query, e.g. +``run_single_paper``) pass ``None`` and keep the pre-relevance recency+citation +behaviour. """ from __future__ import annotations import math +import re from collections.abc import Iterable from thesisagents.core.models import Paper _CURRENT_YEAR_FALLBACK = 2026 +# Relevance weights. Title overlap dominates; abstract overlap is a softer +# secondary signal. Tuned together with _CITATION_WEIGHT so that a full +# title match (= _TITLE_WEIGHT) outranks even a ~100k-citation off-topic paper +# (citation term ≈ _CITATION_WEIGHT * log10(1e5) = 0.4 * 5 = 2.0 < 3.0). +_TITLE_WEIGHT = 3.0 +_ABSTRACT_WEIGHT = 0.6 +# Damping on the log10 citation term. Keeps "most-cited" a meaningful tie-break +# among similarly-relevant papers without letting it swamp the topic signal. +_CITATION_WEIGHT = 0.4 +# Query/title tokens shorter than this are dropped as stop-word-ish noise +# ("a", "of", "is", "the"). 3 keeps useful short terms like "llm", "rag", "gan". +_MIN_TERM_LEN = 3 + +_WORD_RE = re.compile(r"[a-z0-9]+") -def rank(papers: Iterable[Paper], current_year: int | None = None) -> list[Paper]: - """Stable sort by descending composite score.""" + +def rank( + papers: Iterable[Paper], + keywords: str | None = None, + current_year: int | None = None, +) -> list[Paper]: + """Stable sort by descending composite score (relevance + recency + citation). + + ``keywords`` is the raw query string; when given, papers whose title / + abstract share terms with it rank higher. ``None`` disables the relevance + axis (single-paper / query-less callers). + """ year_base = current_year or _CURRENT_YEAR_FALLBACK + terms = _terms(keywords) if keywords else frozenset() return sorted( papers, - key=lambda paper: _score(paper, year_base), + key=lambda paper: _score(paper, year_base, terms), reverse=True, ) -def _score(paper: Paper, current_year: int) -> float: - recency = _recency_score(paper.year, current_year) - citation = _citation_score(paper.citation_count) - return recency + citation +def _score(paper: Paper, current_year: int, terms: frozenset[str]) -> float: + return ( + _relevance_score(paper, terms) + + _recency_score(paper.year, current_year) + + _citation_score(paper.citation_count) + ) + + +def _terms(text: str) -> frozenset[str]: + """Lowercase alphanumeric tokens of length >= ``_MIN_TERM_LEN``.""" + return frozenset(w for w in _WORD_RE.findall(text.lower()) if len(w) >= _MIN_TERM_LEN) + + +def _relevance_score(paper: Paper, terms: frozenset[str]) -> float: + """Fraction of query terms appearing in title / abstract, title-weighted. + + Range ``[0, _TITLE_WEIGHT + _ABSTRACT_WEIGHT]``. ``0.0`` when no query terms + were supplied, so the score reduces to recency + citation. + """ + if not terms: + return 0.0 + title_hit = len(terms & _terms(paper.title)) / len(terms) + abstract_hit = len(terms & _terms(paper.abstract)) / len(terms) + return title_hit * _TITLE_WEIGHT + abstract_hit * _ABSTRACT_WEIGHT def _recency_score(year: int | None, current_year: int) -> float: @@ -41,4 +106,4 @@ def _recency_score(year: int | None, current_year: int) -> float: def _citation_score(citations: int | None) -> float: if citations is None or citations <= 0: return 0.0 - return math.log10(citations + 1.0) + return _CITATION_WEIGHT * math.log10(citations + 1.0) diff --git a/thesisagents/mcp/server.py b/thesisagents/mcp/server.py index 3c4f340..d86465f 100644 --- a/thesisagents/mcp/server.py +++ b/thesisagents/mcp/server.py @@ -274,8 +274,10 @@ async def search( conferences + Nature / Science / PNAS / CACM / LNCS). arXiv preprints always pass through. Pass ``False`` for a broader net. - ``min_citations`` filters out papers below the threshold; pass - ``None`` for no minimum. + ``min_citations`` filters out papers whose citation count is below + the threshold — enforced across every source in the pipeline, with + papers whose source reports no count kept (not treated as zero). + Pass ``None`` for no minimum. Returns a JSON-serialisable dict with a `papers` list. Each paper has the same fields as Paper.to_dict() — pass the list straight to `export`. From aa3390b1a4a5500a81c2e2d09487d49b8a689c36 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 23:12:12 +0800 Subject: [PATCH 2/6] Further paper-research workflow improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continuation of the workflow-quality pass. - dedup: canonicalise the first-author surname so name-format differences ("Vaswani, Ashish" / "Ashish Vaswani" / "A. Vaswani") don't split one DOI-less paper into duplicates — complements the title/arXiv-version canon. - pipeline: enforce the year range for ALL sources after dedup/rank, not only inside the source plugins. Scrape sources (scholar / ieee) filter loosely, so a pipeline guard keeps the result within [year_from, year_to]; papers with an unknown year are kept. - cli: expose --min-citations. The flag and the Query wiring were both missing, so the filter (now pipeline-wide) was unreachable from the CLI. docs/cli.md updated. Gates: 592 tests pass (+4), ruff + bandit clean. --- docs/cli.md | 3 ++- tests/test_cli.py | 22 ++++++++++++++++++++++ tests/test_dedup_ranking.py | 20 ++++++++++++++++++++ tests/test_pipeline.py | 24 ++++++++++++++++++++++++ thesisagents/cli.py | 11 +++++++++++ thesisagents/core/models.py | 24 +++++++++++++++++++++++- thesisagents/core/pipeline.py | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 134 insertions(+), 2 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 0b05874..f06c014 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -15,7 +15,7 @@ exclusive modes: thesisagents (--query KEYWORDS | --paper IDENTIFIER) [--source SOURCES] [--exclude-source SOURCES] [--max N] - [--year-from YEAR] [--year-to YEAR] + [--year-from YEAR] [--year-to YEAR] [--min-citations N] [--export FORMATS] [--out DIR] [--filename-stem STEM] @@ -39,6 +39,7 @@ thesisagents (--query KEYWORDS | --paper IDENTIFIER) | `--exclude-source` / `-x` | — | Comma-separated sources to remove from the mix, subtracted **after** `--source` resolves. The no-VPN gesture is to leave `--source` at its default and pass `--exclude-source ieee` — every other default source stays in. An unknown name (or excluding the whole mix) is an error. | | `--max` / `-n` | `25` | Range 1..200. | | `--year-from`, `--year-to` | — | Inclusive year filter. | +| `--min-citations` | — | Drop papers below this citation count, enforced across **all** sources (not just Semantic Scholar). Papers whose source reports no count are kept. Omit for no minimum. | | `--export` / `-e` | mode-specific | Any of `pptx`, `xlsx`, `md`, `bib`, `json`, `ris`, `csv`, `csl`. **Default with `--query` is `pptx,xlsx,bib`; default with `--paper` is `pptx,bib`** (one-row Excel is busy work). Explicit `--export` always wins. `ris` (Zotero/Mendeley/EndNote), `csv` (flat table) and `csl` (CSL-JSON for Pandoc, written as `.csl.json`) are the interchange formats. | | `--list-sources`, `--list-exports` | — | Print the available search sources / export formats and exit (no query needed). | | `--out` / `-o` | `./exports` | Created if missing. | diff --git a/tests/test_cli.py b/tests/test_cli.py index b8c2ed1..a423f4a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -320,6 +320,28 @@ async def fake_shutdown() -> None: # NOSONAR async stub assert captured["query"].sources == expected +def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers): + """--min-citations is parsed and passed through to the Query (it was + previously unreachable from the CLI).""" + captured: dict[str, Query] = {} + + async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: + captured["query"] = query + return PaperCollection(query=query, papers=tuple(sample_papers)) + + async def fake_shutdown() -> None: + return None + + monkeypatch.setattr(cli_module, "run_search", fake_run_search) + monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) + code = cli_module.main( + ["--query", "x", "--out", str(tmp_path), "--export", "bib", + "--min-citations", "50"] + ) + assert code == 0 + assert captured["query"].min_citations == 50 + + def test_cli_exclude_unknown_source_errors(tmp_path): """A typo in --exclude-source must fail loudly, not silently no-op.""" with pytest.raises(SystemExit): diff --git a/tests/test_dedup_ranking.py b/tests/test_dedup_ranking.py index da7e3d9..2a761dc 100644 --- a/tests/test_dedup_ranking.py +++ b/tests/test_dedup_ranking.py @@ -99,6 +99,26 @@ def test_dedupe_collapses_arxiv_versions(): assert len(dedupe([v1, v2])) == 1 +def test_dedupe_collapses_author_name_formats(): + """Same DOI-less paper whose first author arrives in three different name + formats across sources collapses to one record (surname canonicalisation).""" + a = _paper(source_id="1", title="On Method X", + authors=("Ashish Vaswani",), year=2020) + b = _paper(source_id="2", title="On Method X", + authors=("Vaswani, Ashish",), year=2020) + c = _paper(source_id="3", title="On Method X", + authors=("A. Vaswani",), year=2020) + assert len(dedupe([a, b, c])) == 1 + + +def test_dedupe_distinguishes_different_surnames(): + """Different first-author surnames (same title/year) stay distinct — the + surname canonicalisation must not over-merge.""" + a = _paper(source_id="1", title="Common Title", authors=("Alice Smith",), year=2020) + b = _paper(source_id="2", title="Common Title", authors=("Bob Jones",), year=2020) + assert len(dedupe([a, b])) == 2 + + def test_rank_prefers_recent_papers(): old = _paper(source_id="old", year=2010) new = _paper(source_id="new", year=2024) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index f5404a8..304063b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -295,6 +295,30 @@ def _cited(source_id, count): assert "low" not in ids # 5 < 50 filtered out +async def test_run_search_year_range_pipeline_guard(monkeypatch): + """The pipeline enforces the year range even when a source returns + out-of-range papers (scrape sources filter loosely); unknown years kept.""" + def _yr(sid, year): + return Paper( + source="arxiv", source_id=sid, title=sid, authors=(), year=year, + venue=None, abstract="", url=f"https://example.com/{sid}", + ) + + in_range = _yr("in", 2022) + too_old = _yr("old", 2010) + too_new = _yr("new", 2025) + unknown = _yr("unk", None) + pool = {"arxiv": _make_fetcher("arxiv", [in_range, too_old, too_new, unknown])} + monkeypatch.setattr(pipeline_module, "load_fetcher", lambda name: pool[name]) + query = Query( + keywords="x", sources=("arxiv",), max_results=10, + year_from=2020, year_to=2024, + ) + collection = await pipeline_module.run_search(query, resolve_oa=False) + ids = {p.source_id for p in collection.papers} + assert ids == {"in", "unk"} # in-range + unknown kept; old/new dropped + + async def test_enrich_collection_caps_concurrency(monkeypatch): """A semaphore bounds how many per-paper enrichments run at once, so a big collection doesn't fire one Anthropic call + one PDF download per paper.""" diff --git a/thesisagents/cli.py b/thesisagents/cli.py index 99161d3..67797d9 100644 --- a/thesisagents/cli.py +++ b/thesisagents/cli.py @@ -184,6 +184,16 @@ def build_parser() -> argparse.ArgumentParser: default=None, help="Latest publication year (inclusive).", ) + parser.add_argument( + "--min-citations", + type=int, + default=None, + help=( + "Drop papers with fewer than this many citations (enforced across " + "all sources). Papers whose source reports no citation count are " + "kept. Omit for no minimum." + ), + ) parser.add_argument( "--export", "-e", @@ -622,6 +632,7 @@ async def _collect(args: argparse.Namespace): year_from=args.year_from, year_to=args.year_to, top_tier_only=args.top_tier_only, + min_citations=args.min_citations, ) _LOG.info("Running search: %s across %s", keywords, ", ".join(sources)) return await run_search(query, resolve_oa=args.resolve_oa) diff --git a/thesisagents/core/models.py b/thesisagents/core/models.py index 067b76f..9d08309 100644 --- a/thesisagents/core/models.py +++ b/thesisagents/core/models.py @@ -27,6 +27,25 @@ def _canon_title(title: str) -> str: return _TITLE_NOISE_RE.sub("", title.lower()) +def _canon_author(name: str) -> str: + """Canonical surname token of an author name for dedup hashing. + + Sources format the first author differently — ``"Vaswani, Ashish"`` + (PubMed), ``"Ashish Vaswani"`` (arXiv), ``"A. Vaswani"`` (Crossref). Reduce + each to its surname so a DOI-less paper isn't split into duplicates by name + formatting: a comma form keeps the part before the comma, otherwise the last + whitespace-separated token. All three examples above collapse to + ``"vaswani"``. + """ + name = name.strip().lower() + if not name: + return "" + if "," in name: + return name.split(",", 1)[0].strip() + parts = name.split() + return parts[-1] if parts else "" + + @dataclass(frozen=True, slots=True) class RqResult: """One research-question evaluation block: question + result table + bullets. @@ -172,13 +191,16 @@ def dedup_key(self) -> str: * Title hashes strip punctuation and whitespace via :func:`_canon_title`, so ``"Attention Is All You Need"`` and ``"Attention is all you need."`` produce the same key. + * The first author is reduced to its surname via + :func:`_canon_author`, so ``"Vaswani, Ashish"`` / ``"Ashish + Vaswani"`` / ``"A. Vaswani"`` don't split one paper into duplicates. """ if self.doi: return f"doi:{self.doi.lower()}" if self.arxiv_id: arxiv = re.sub(r"v\d+$", "", self.arxiv_id.strip().lower()) return f"arxiv:{arxiv}" - first_author = self.authors[0].strip().lower() if self.authors else "" + first_author = _canon_author(self.authors[0]) if self.authors else "" seed = f"{_canon_title(self.title)}|{first_author}|{self.year or ''}" digest = hashlib.sha256(seed.encode("utf-8"), usedforsecurity=False).hexdigest() return f"hash:{digest[:16]}" diff --git a/thesisagents/core/pipeline.py b/thesisagents/core/pipeline.py index 7fa8c68..5fe8d7e 100644 --- a/thesisagents/core/pipeline.py +++ b/thesisagents/core/pipeline.py @@ -90,6 +90,21 @@ async def run_search( "min-citations(>=%d) filter kept %d / %d papers (unknown counts kept)", query.min_citations, len(ordered), before, ) + if query.year_from is not None or query.year_to is not None: + # Pipeline-level year guard. Most source plugins already filter by year, + # but scrape sources (scholar / ieee) do it loosely, so enforce the + # range once here for ALL sources. Papers with an unknown year are KEPT + # (uncertainty must not silently drop a possibly-in-range paper). + before = len(ordered) + ordered = [ + paper + for paper in ordered + if _in_year_range(paper.year, query.year_from, query.year_to) + ] + _LOG.info( + "year filter [%s..%s] kept %d / %d papers (unknown years kept)", + query.year_from or "", query.year_to or "", len(ordered), before, + ) collection = PaperCollection( query=query, papers=tuple(ordered[: query.max_results]) ) @@ -140,6 +155,23 @@ async def _safe_search(fetcher, query: Query) -> list[Paper]: return [] +def _in_year_range( + year: int | None, year_from: int | None, year_to: int | None +) -> bool: + """True if ``year`` is within ``[year_from, year_to]`` (None bound = open). + + Unlike the per-source ``in_year_range`` helpers (which drop year-less + records), this pipeline guard KEEPS a paper whose year is unknown — at this + stage the source already chose to return it, so an unknown year is treated + as "possibly in range" rather than silently filtered out. + """ + if year is None: + return True + if year_from is not None and year < year_from: + return False + return not (year_to is not None and year > year_to) + + async def run_single_paper(identifier: PaperIdentifier) -> PaperCollection: """Fetch exactly one paper by its identifier and wrap it in a collection. From 6ba99b2ef28b339cba2e0a33652301fc2ba34d35 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 23:17:37 +0800 Subject: [PATCH 3/6] Capture workflow-pass conventions in subagent rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ranking/dedup/filter/concurrency workflow pass introduced two conventions worth pinning so they don't bit-rot: - code-quality-reviewer (Async & Concurrency): add the bounded-fan-out rule — a stage that gathers one op over N papers (enrich / download / OA resolve) escapes the per-source token buckets and must use a module-level Semaphore cap. Includes the 25-paper 429 / IP-block rationale + anti-pattern + pattern. - compliance-auditor (Core vs Source Plugins): add the query-semantics rule — Query filter fields (min_citations / year / top_tier) are enforced in the pipeline for all sources, never delegated to plugins. Documents the real min_citations incident (only semantic_scholar honoured it) + anti-pattern. --- .claude/agents/rules/code-quality-reviewer.md | 1 + .claude/agents/rules/compliance-auditor.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.claude/agents/rules/code-quality-reviewer.md b/.claude/agents/rules/code-quality-reviewer.md index 2416438..9cc0774 100644 --- a/.claude/agents/rules/code-quality-reviewer.md +++ b/.claude/agents/rules/code-quality-reviewer.md @@ -51,6 +51,7 @@ You do NOT modify files. The parent agent decides whether to fix. - Per-source rate limits live in `thesisagents/fetchers/rate_limit.py` as a token-bucket decorator. Each source plugin declares its own bucket (`arxiv: 1 req/3s`, `semantic_scholar: 1 req/s`, `scholar: 1 req/10s with jitter`, etc.). Do NOT bypass the bucket — even retries go through it. - Streamlit runs the UI on a separate thread per session. Mutate `st.session_state` only, never module globals. Long-running export jobs are dispatched to the FastAPI backend and polled. - All fixture-recording, CLI exports, and tests use `asyncio.run` at the outermost layer and never inside library code. +- **Bounded fan-out over a paper collection.** A per-source token bucket caps traffic *within* one source, but a stage that fans one operation out over N papers (PDF download, OA resolve, LLM enrich) escapes those buckets — several papers can hit the *same* external API (Anthropic) or publisher CDN (`dl.acm.org`) at once through different source clients. Any `asyncio.gather(*(coro(item) for item in collection.papers))` MUST wrap each task in a module-level `asyncio.Semaphore` cap (the project uses 4–5: `pipeline._ENRICH_CONCURRENCY`, `pdf_download._DOWNLOAD_CONCURRENCY`, `oa_resolver._CONCURRENCY`). **Why:** an unbounded fan-out of a 25-paper search fires 25 concurrent Anthropic calls (an easy 429) or 25 hits on one CDN (an IP block) — it looks fine at 3 papers and fails at 30. **Anti-pattern:** `await asyncio.gather(*(_download_one(p, d) for p in collection.papers))` with no semaphore. **Pattern:** define `sem = asyncio.Semaphore(max(1, concurrency))` and a `_bounded` wrapper that does `async with sem: return await coro(item)`. ## Security (review-level) diff --git a/.claude/agents/rules/compliance-auditor.md b/.claude/agents/rules/compliance-auditor.md index 1e5c03c..360e035 100644 --- a/.claude/agents/rules/compliance-auditor.md +++ b/.claude/agents/rules/compliance-auditor.md @@ -46,6 +46,8 @@ The line between `thesisagents/` and `sources//` is **not** "anything sour **Loading source plugins:** at runtime `thesisagents/fetchers/base.py::load_fetcher(name)` does `importlib.import_module(f"thesisagents.sources.{name}")`. No more `sys.path` injection — sources are a regular sub-package of `thesisagents`. Plugins under `thesisagents/sources//` use **relative imports** internally (`from .fetcher import …`, `from .parser import …`, `from . import webrunner_backend`). Tests import them via the full path (`from thesisagents.sources..fetcher import …`) and reference module-string targets the same way (e.g. `monkeypatch.setattr("thesisagents.sources.acm.fetcher.get_client", …)`). +**Query semantics are enforced in core, never delegated to plugins.** Every filter field on `Query` (`min_citations`, `year_from` / `year_to`, `top_tier_only`) MUST be applied by the pipeline (`thesisagents/core/pipeline.py::run_search`, after dedup + rank) so it holds for **all** sources uniformly. A plugin MAY *additionally* push the filter to its upstream API as an optimisation (e.g. `semantic_scholar` sends `minCitationCount`), but core must never *rely* on plugins to do it. **Why (real incident):** `min_citations` was accepted by CLI/MCP/`Query`, documented as a threshold filter, and implemented by only the `semantic_scholar` plugin — so for the other 14 sources it silently did nothing; `min_citations=50` still returned 2-citation arXiv / OpenAlex / DBLP papers. **Anti-pattern:** wiring a `Query` filter into one source's request params and assuming the result set is filtered. **Pattern:** filter in `run_search`, and keep records whose value is *unknown* (`citation_count is None` / `year is None`) rather than treating unknown as failing the threshold (a source not reporting a count must not silently drop the paper). + **When in doubt:** "if a user installs ThesisAgents with the default `requirements.txt` and never enables a source plugin, should this source work?" Yes → core. No → source plugin. --- From b6df24fb6493f131dcd82666b0f7a4509d8498cd Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 23:28:31 +0800 Subject: [PATCH 4/6] Resolve SonarCloud PR-17 findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - S2583 (test_pdf_download / test_pipeline concurrency tests): the 'condition always false' on 'assert 1 <= peak <= 3' was a static-analysis false positive — Sonar can't see that 'peak' (a nonlocal int) is mutated by the monkeypatched async double via download_pdfs/enrich_collection's gather. Switch the counter to a dict so the mutation isn't const-propagated to 0; same fix applied to the enrich test pre-emptively. - S7503 (test_cli min-citations test): the fake_run_search / fake_shutdown doubles replace awaited production fns and must stay async — mark NOSONAR. --- tests/test_cli.py | 4 ++-- tests/test_pdf_download.py | 14 +++++++------- tests/test_pipeline.py | 14 +++++++------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index a423f4a..0c7aefb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -325,11 +325,11 @@ def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers previously unreachable from the CLI).""" captured: dict[str, Query] = {} - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: + async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: # NOSONAR async stub captured["query"] = query return PaperCollection(query=query, papers=tuple(sample_papers)) - async def fake_shutdown() -> None: + async def fake_shutdown() -> None: # NOSONAR async stub return None monkeypatch.setattr(cli_module, "run_search", fake_run_search) diff --git a/tests/test_pdf_download.py b/tests/test_pdf_download.py index a42949f..19ca761 100644 --- a/tests/test_pdf_download.py +++ b/tests/test_pdf_download.py @@ -347,15 +347,15 @@ async def test_download_pdfs_caps_concurrency(tmp_path: Path, monkeypatch): hammer a single publisher CDN with one request per paper at once.""" import asyncio - active = 0 - peak = 0 + # dict (not nonlocal ints) so the counter mutation is visible to static + # analysers that don't trace the monkeypatched async call path. + counters = {"active": 0, "peak": 0} async def fake_download_one(paper, pdf_dir): - nonlocal active, peak - active += 1 - peak = max(peak, active) + counters["active"] += 1 + counters["peak"] = max(counters["peak"], counters["active"]) await asyncio.sleep(0.01) - active -= 1 + counters["active"] -= 1 return pdf_download_module.PdfDownloadResult( paper_key=paper.source_id, path=None, skipped_reason=None ) @@ -364,4 +364,4 @@ async def fake_download_one(paper, pdf_dir): papers = [_paper(source_id=str(i), pdf_url="https://e/p.pdf") for i in range(8)] results = await download_pdfs(_collection(*papers), tmp_path, concurrency=3) assert len(results) == 8 - assert 1 <= peak <= 3 # never more than the cap in flight at once + assert 1 <= counters["peak"] <= 3 # never more than the cap in flight at once diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 304063b..ecbd562 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -326,15 +326,15 @@ async def test_enrich_collection_caps_concurrency(monkeypatch): from thesisagents.core.models import PaperCollection - active = 0 - peak = 0 + # dict (not nonlocal ints) so the counter mutation is visible to static + # analysers that don't trace the monkeypatched async call path. + counters = {"active": 0, "peak": 0} async def fake_enrich_one(paper, *, language, model): - nonlocal active, peak - active += 1 - peak = max(peak, active) + counters["active"] += 1 + counters["peak"] = max(counters["peak"], counters["active"]) await asyncio.sleep(0.01) - active -= 1 + counters["active"] -= 1 return paper monkeypatch.setattr(pipeline_module, "_enrich_one", fake_enrich_one) @@ -345,4 +345,4 @@ async def fake_enrich_one(paper, *, language, model): ) out = await pipeline_module.enrich_collection(collection, concurrency=3) assert len(out.papers) == 8 - assert 1 <= peak <= 3 # never more than the cap in flight at once + assert 1 <= counters["peak"] <= 3 # never more than the cap in flight at once From 98b34d4d4e1b88e7669bed00b1c73ade2d538c86 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 23:41:30 +0800 Subject: [PATCH 5/6] Reduce test_cli.py duplication via shared capture fixture The per-test fake_run_search / fake_shutdown / monkeypatch boilerplate was the single biggest duplication source (118 dup lines, ~13% file density). Upgrade the existing patched_pipeline fixture to also capture the Query, and route the four query-asserting tests (source-default / exclude-source / min-citations / top-tier) through it instead of each re-rolling its own fake. -35 lines; the two async-stub NOSONARs now live once in the fixture. --- tests/test_cli.py | 85 ++++++++++++++--------------------------------- 1 file changed, 25 insertions(+), 60 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0c7aefb..f1ab12e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -43,16 +43,28 @@ async def _fake_success(collection, out_dir): @pytest.fixture() def patched_pipeline(monkeypatch, sample_papers): - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: + """Stub cli's run_search + shutdown_clients and return a dict that captures + the Query passed to run_search (``patched_pipeline["query"]``). + + Tests that only need the pipeline stubbed ignore the return value; tests + that assert on the constructed Query read ``patched_pipeline["query"]`` + instead of each re-rolling their own ``fake_run_search`` / ``fake_shutdown`` + boilerplate. + """ + captured: dict[str, Query] = {} + + async def fake_run_search(query, **_kwargs): # NOSONAR async stub + captured["query"] = query return PaperCollection(query=query, papers=tuple(sample_papers)) - async def fake_shutdown() -> None: + async def fake_shutdown(): # NOSONAR async stub return None monkeypatch.setattr(cli_module, "run_search", fake_run_search) monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) # Leave the autouse ``_stub_download_pdfs`` in place so the new per-paper # PPT gate sees every paper as downloadable. + return captured def test_cli_runs_end_to_end(tmp_path: Path, patched_pipeline, capsys): @@ -248,28 +260,16 @@ def test_cli_rejects_doi_identifier_until_resolver_lands(tmp_path): assert code == 2 -def test_cli_source_default_is_multi_source(tmp_path, monkeypatch, sample_papers): +def test_cli_source_default_is_multi_source(tmp_path, patched_pipeline): """When --source is omitted, run_search must be invoked across the DEFAULT_SOURCES mix, not just arxiv.""" from thesisagents.core.constants import DEFAULT_SOURCES - captured: dict[str, Query] = {} - - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: - captured["query"] = query - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: - return None - - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) - code = cli_module.main( ["--query", "x", "--out", str(tmp_path), "--export", "bib"] ) assert code == 0 - assert captured["query"].sources == DEFAULT_SOURCES + assert patched_pipeline["query"].sources == DEFAULT_SOURCES def test_cli_list_sources(capsys): @@ -293,53 +293,30 @@ def test_cli_list_exports(capsys): assert "pptx" in out -def test_cli_exclude_source_prunes_default_mix(tmp_path, monkeypatch, sample_papers): +def test_cli_exclude_source_prunes_default_mix(tmp_path, patched_pipeline): """--exclude-source subtracts from the resolved mix; the no-VPN path drops only ieee and keeps every other default source.""" from thesisagents.core.constants import DEFAULT_SOURCES - captured: dict[str, Query] = {} - - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: # NOSONAR async stub - captured["query"] = query - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: # NOSONAR async stub - return None - - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) - code = cli_module.main( ["--query", "x", "--out", str(tmp_path), "--export", "bib", "--exclude-source", "ieee"] ) assert code == 0 - assert "ieee" not in captured["query"].sources + assert "ieee" not in patched_pipeline["query"].sources expected = tuple(s for s in DEFAULT_SOURCES if s != "ieee") - assert captured["query"].sources == expected + assert patched_pipeline["query"].sources == expected -def test_cli_min_citations_flows_into_query(tmp_path, monkeypatch, sample_papers): +def test_cli_min_citations_flows_into_query(tmp_path, patched_pipeline): """--min-citations is parsed and passed through to the Query (it was previously unreachable from the CLI).""" - captured: dict[str, Query] = {} - - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: # NOSONAR async stub - captured["query"] = query - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: # NOSONAR async stub - return None - - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) code = cli_module.main( ["--query", "x", "--out", str(tmp_path), "--export", "bib", "--min-citations", "50"] ) assert code == 0 - assert captured["query"].min_citations == 50 + assert patched_pipeline["query"].min_citations == 50 def test_cli_exclude_unknown_source_errors(tmp_path): @@ -359,33 +336,21 @@ def test_cli_exclude_all_sources_errors(tmp_path): ) -def test_cli_top_tier_filter_off_by_default(tmp_path, monkeypatch, sample_papers): +def test_cli_top_tier_filter_off_by_default(tmp_path, patched_pipeline): """top_tier_only is OFF by default (broader coverage including IEEE / ACM workshops); --top-tier-only flips it on.""" - captured: dict[str, Query] = {} - - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: - captured["query"] = query - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: - return None - - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) - code = cli_module.main( ["--query", "x", "--out", str(tmp_path), "--export", "bib"] ) assert code == 0 - assert captured["query"].top_tier_only is False + assert patched_pipeline["query"].top_tier_only is False - captured.clear() + patched_pipeline.clear() code = cli_module.main( ["--query", "x", "--top-tier-only", "--out", str(tmp_path), "--export", "bib"] ) assert code == 0 - assert captured["query"].top_tier_only is True + assert patched_pipeline["query"].top_tier_only is True def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, sample_papers): From f44b40ba9511cde18d75de9d03fe9d8bdc5abbc8 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 5 Jun 2026 23:47:45 +0800 Subject: [PATCH 6/6] Reduce test_cli.py duplication further Route the pdf-download and auto-enrich tests through the shared patched_pipeline fixture, and delete the now-redundant _fake_search_with_papers helper (it duplicated the fixture's run_search / shutdown stub). test_cli.py 926 -> 848 lines across this pass; the only remaining fake_run_search definitions are the fixture and _patch_search (which legitimately takes custom papers with controlled pdf_url). --- tests/test_cli.py | 65 ++++++++--------------------------------------- 1 file changed, 11 insertions(+), 54 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index f1ab12e..b0233ed 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -353,61 +353,33 @@ def test_cli_top_tier_filter_off_by_default(tmp_path, patched_pipeline): assert patched_pipeline["query"].top_tier_only is True -def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, sample_papers): +def test_cli_default_triggers_pdf_download(tmp_path, monkeypatch, patched_pipeline): """Default flag set should invoke download_pdfs; --no-pdf disables it.""" calls: list[str] = [] - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: - return None - - async def fake_download(_collection, _out_dir): + async def fake_download(_collection, _out_dir): # NOSONAR async stub calls.append("called") return [] - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) monkeypatch.setattr(cli_module, "download_pdfs", fake_download) - code = cli_module.main( - [ - "--query", "x", - "--source", "arxiv", - "--out", str(tmp_path), - "--export", "bib", - ] + ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"] ) assert code == 0 assert calls == ["called"] -def test_cli_no_pdf_flag_skips_download(tmp_path, monkeypatch, sample_papers): +def test_cli_no_pdf_flag_skips_download(tmp_path, monkeypatch, patched_pipeline): calls: list[str] = [] - async def fake_run_search(query: Query, **_kwargs) -> PaperCollection: - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown() -> None: - return None - - async def fake_download(_collection, _out_dir): + async def fake_download(_collection, _out_dir): # NOSONAR async stub calls.append("called") return [] - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) monkeypatch.setattr(cli_module, "download_pdfs", fake_download) - code = cli_module.main( - [ - "--query", "x", - "--source", "arxiv", - "--no-pdf", - "--out", str(tmp_path), - "--export", "bib", - ] + ["--query", "x", "--source", "arxiv", "--no-pdf", + "--out", str(tmp_path), "--export", "bib"] ) assert code == 0 assert calls == [] @@ -734,21 +706,9 @@ async def fake_enrich(collection, language=None, model=None): # noqa: ARG001 return calls -def _fake_search_with_papers(monkeypatch, sample_papers): - async def fake_run_search(query, **_kwargs): - return PaperCollection(query=query, papers=tuple(sample_papers)) - - async def fake_shutdown(): - return None - - monkeypatch.setattr(cli_module, "run_search", fake_run_search) - monkeypatch.setattr(cli_module, "shutdown_clients", fake_shutdown) - - -def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, sample_papers): +def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, patched_pipeline): """ANTHROPIC_API_KEY in env + no --lightweight = auto-enrich fires.""" monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") - _fake_search_with_papers(monkeypatch, sample_papers) calls = _stub_enrich_collection(monkeypatch) code = cli_module.main( ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"] @@ -757,10 +717,9 @@ def test_cli_auto_enriches_when_api_key_set(tmp_path, monkeypatch, sample_papers assert calls == ["called"] -def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, sample_papers): +def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, patched_pipeline): """--lightweight wins over the auto-enrich default.""" monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") - _fake_search_with_papers(monkeypatch, sample_papers) calls = _stub_enrich_collection(monkeypatch) code = cli_module.main( [ @@ -773,10 +732,9 @@ def test_cli_lightweight_skips_auto_enrich(tmp_path, monkeypatch, sample_papers) assert calls == [] -def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, sample_papers): +def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, patched_pipeline): """No ANTHROPIC_API_KEY → no Anthropic call, lightweight deck.""" monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) - _fake_search_with_papers(monkeypatch, sample_papers) calls = _stub_enrich_collection(monkeypatch) code = cli_module.main( ["--query", "x", "--source", "arxiv", "--out", str(tmp_path), "--export", "bib"] @@ -785,11 +743,10 @@ def test_cli_no_key_does_not_auto_enrich(tmp_path, monkeypatch, sample_papers): assert calls == [] -def test_cli_explicit_enrich_still_works(tmp_path, monkeypatch, sample_papers): +def test_cli_explicit_enrich_still_works(tmp_path, monkeypatch, patched_pipeline): """--enrich runs even without a key in env (the explicit path used to error inside the API client; here we only check the CLI dispatch).""" monkeypatch.setenv("ANTHROPIC_API_KEY", "test-key") - _fake_search_with_papers(monkeypatch, sample_papers) calls = _stub_enrich_collection(monkeypatch) code = cli_module.main( [