Skip to content

Commit b59f719

Browse files
authored
Merge pull request #505 from mrexodia/redundant-fetches
Reduce redundant fetches
2 parents 02e833e + 014eff3 commit b59f719

6 files changed

Lines changed: 296 additions & 18 deletions

File tree

CHANGES.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ Unreleased
1212
backups use the legacy global checkpoint as a migration fallback, and the
1313
legacy file is removed once existing issue/pull backups have resource
1414
checkpoints (#62).
15+
- Stop paginating pull requests during incremental backups once the sorted
16+
results are at or older than the active checkpoint.
17+
- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly
18+
matches the active incremental checkpoint.
19+
- Avoid extra release asset list requests by using asset metadata already
20+
included in GitHub's releases response.
1521
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.
1622

1723

github_backup/github_backup.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers):
717717
return delay + random.uniform(0, delay * 0.1)
718718

719719

720-
def retrieve_data(args, template, query_args=None, paginated=True):
720+
def retrieve_data(args, template, query_args=None, paginated=True, lazy=False):
721721
"""
722722
Fetch the data from GitHub API.
723723
724-
Handle both single requests and pagination with yield of individual dicts.
724+
Handle both single requests and pagination. Returns a list by default, or
725+
a generator when lazy=True so callers can stop before fetching every page.
725726
Handles throttling, retries, read errors, and DMCA takedowns.
726727
"""
727728
query_args = query_args or {}
@@ -851,6 +852,9 @@ def _extract_legal_url(response_body_bytes):
851852
):
852853
break # No more data
853854

855+
if lazy:
856+
return fetch_all()
857+
854858
return list(fetch_all())
855859

856860

@@ -2229,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None):
22292233
if updated_at and (newest_seen is None or updated_at > newest_seen):
22302234
newest_seen = updated_at
22312235

2232-
if since and updated_at and updated_at < since:
2236+
if since and updated_at and updated_at <= since:
22332237
stop = True
22342238
break
22352239

@@ -2650,26 +2654,28 @@ def track_newest_pull_update(pull):
26502654
newest_pull_update = updated_at
26512655

26522656
def pull_is_due_for_repository_checkpoint(pull):
2653-
return not repository_since or pull["updated_at"] >= repository_since
2657+
return not repository_since or pull["updated_at"] > repository_since
26542658

26552659
if not args.include_pull_details:
26562660
pull_states = ["open", "closed"]
26572661
for pull_state in pull_states:
26582662
query_args["state"] = pull_state
2659-
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
2660-
for pull in _pulls:
2663+
for pull in retrieve_data(
2664+
args, _pulls_template, query_args=query_args, lazy=True
2665+
):
26612666
track_newest_pull_update(pull)
2662-
if pulls_since and pull["updated_at"] < pulls_since:
2667+
if pulls_since and pull["updated_at"] <= pulls_since:
26632668
break
2664-
if not pulls_since or pull["updated_at"] >= pulls_since:
2669+
if not pulls_since or pull["updated_at"] > pulls_since:
26652670
pulls[pull["number"]] = pull
26662671
else:
2667-
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
2668-
for pull in _pulls:
2672+
for pull in retrieve_data(
2673+
args, _pulls_template, query_args=query_args, lazy=True
2674+
):
26692675
track_newest_pull_update(pull)
2670-
if pulls_since and pull["updated_at"] < pulls_since:
2676+
if pulls_since and pull["updated_at"] <= pulls_since:
26712677
break
2672-
if not pulls_since or pull["updated_at"] >= pulls_since:
2678+
if not pulls_since or pull["updated_at"] > pulls_since:
26732679
if pull_is_due_for_repository_checkpoint(pull):
26742680
pulls[pull["number"]] = retrieve_data(
26752681
args,
@@ -2913,7 +2919,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
29132919
written_count += 1
29142920

29152921
if include_assets and not skip_assets:
2916-
assets = retrieve_data(args, release["assets_url"])
2922+
# The releases list API already includes release asset metadata. Use
2923+
# it to avoid an extra /releases/{id}/assets request per release.
2924+
# Keep a fallback for older/enterprise responses that might omit it.
2925+
assets = release.get("assets")
2926+
if assets is None:
2927+
assets = retrieve_data(args, release["assets_url"])
29172928
if len(assets) > 0:
29182929
# give release asset files somewhere to live & download them (not including source archives)
29192930
release_assets_cwd = os.path.join(release_cwd, release_name_safe)

tests/test_discussions.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args):
5050
)
5151

5252

53+
def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args):
54+
args = create_args()
55+
repository = {"full_name": "owner/repo"}
56+
57+
page = {
58+
"repository": {
59+
"hasDiscussionsEnabled": True,
60+
"discussions": {
61+
"totalCount": 1,
62+
"nodes": [
63+
{
64+
"number": 1,
65+
"title": "already backed up",
66+
"updatedAt": "2026-01-01T00:00:00Z",
67+
},
68+
],
69+
"pageInfo": {"hasNextPage": True, "endCursor": "NEXT"},
70+
},
71+
}
72+
}
73+
74+
with patch(
75+
"github_backup.github_backup.retrieve_graphql_data", return_value=page
76+
) as mock_retrieve:
77+
summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
78+
args, repository, since="2026-01-01T00:00:00Z"
79+
)
80+
81+
assert enabled is True
82+
assert total == 1
83+
assert newest == "2026-01-01T00:00:00Z"
84+
assert summaries == []
85+
assert mock_retrieve.call_count == 1
86+
87+
5388
def test_retrieve_discussion_summaries_disabled_discussions(create_args):
5489
args = create_args()
5590
repository = {"full_name": "owner/repo"}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""Tests for incremental pull request pagination."""
2+
3+
import json
4+
import os
5+
from unittest.mock import patch
6+
7+
from github_backup import github_backup
8+
9+
10+
class MockHTTPResponse:
11+
def __init__(self, data, link_header=None):
12+
self._content = json.dumps(data).encode("utf-8")
13+
self._link_header = link_header
14+
self._read = False
15+
self.reason = "OK"
16+
17+
def getcode(self):
18+
return 200
19+
20+
def read(self):
21+
if self._read:
22+
return b""
23+
self._read = True
24+
return self._content
25+
26+
@property
27+
def headers(self):
28+
headers = {"x-ratelimit-remaining": "5000"}
29+
if self._link_header:
30+
headers["Link"] = self._link_header
31+
return headers
32+
33+
34+
def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path):
35+
args = create_args(include_pulls=True, incremental=True)
36+
args.since = "2026-04-26T08:13:46Z"
37+
repository = {"full_name": "owner/repo"}
38+
39+
responses = [
40+
MockHTTPResponse([]),
41+
MockHTTPResponse(
42+
[
43+
{
44+
"number": 1,
45+
"title": "already backed up",
46+
"updated_at": "2026-04-26T08:13:46Z",
47+
},
48+
],
49+
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
50+
),
51+
MockHTTPResponse(
52+
[
53+
{
54+
"number": 0,
55+
"title": "older pull on page 2",
56+
"updated_at": "2026-04-25T07:00:00Z",
57+
}
58+
]
59+
),
60+
]
61+
requests_made = []
62+
63+
def mock_urlopen(request, *args, **kwargs):
64+
requests_made.append(request.get_full_url())
65+
return responses[len(requests_made) - 1]
66+
67+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
68+
github_backup.backup_pulls(
69+
args, tmp_path, repository, "https://api.github.com/repos"
70+
)
71+
72+
assert len(requests_made) == 2
73+
assert "state=open" in requests_made[0]
74+
assert "state=closed" in requests_made[1]
75+
assert all("page=2" not in url for url in requests_made)
76+
assert not os.path.exists(tmp_path / "pulls" / "1.json")
77+
assert not os.path.exists(tmp_path / "pulls" / "0.json")
78+
79+
80+
def test_backup_pulls_incremental_stops_before_fetching_old_pages(
81+
create_args, tmp_path
82+
):
83+
args = create_args(include_pulls=True, incremental=True)
84+
args.since = "2026-04-26T08:13:46Z"
85+
repository = {"full_name": "owner/repo"}
86+
87+
responses = [
88+
MockHTTPResponse([]),
89+
MockHTTPResponse(
90+
[
91+
{
92+
"number": 2,
93+
"title": "new pull",
94+
"updated_at": "2026-04-26T09:00:00Z",
95+
},
96+
{
97+
"number": 1,
98+
"title": "old pull",
99+
"updated_at": "2026-04-26T07:00:00Z",
100+
},
101+
],
102+
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
103+
),
104+
MockHTTPResponse(
105+
[
106+
{
107+
"number": 0,
108+
"title": "older pull on page 2",
109+
"updated_at": "2026-04-25T07:00:00Z",
110+
}
111+
]
112+
),
113+
]
114+
requests_made = []
115+
116+
def mock_urlopen(request, *args, **kwargs):
117+
requests_made.append(request.get_full_url())
118+
return responses[len(requests_made) - 1]
119+
120+
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
121+
github_backup.backup_pulls(
122+
args, tmp_path, repository, "https://api.github.com/repos"
123+
)
124+
125+
assert len(requests_made) == 2
126+
assert "state=open" in requests_made[0]
127+
assert "state=closed" in requests_made[1]
128+
assert all("page=2" not in url for url in requests_made)
129+
assert os.path.exists(tmp_path / "pulls" / "2.json")
130+
assert not os.path.exists(tmp_path / "pulls" / "1.json")
131+
assert not os.path.exists(tmp_path / "pulls" / "0.json")

tests/test_pull_reviews.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
1616
repository = {"full_name": "owner/repo"}
1717
calls = []
1818

19-
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
19+
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
2020
calls.append((template, query_args))
2121
if template == "https://api.github.com/repos/owner/repo/pulls":
2222
if query_args["state"] == "open":
@@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint(
7373
args.since = "2026-01-01T00:00:00Z"
7474
repository = {"full_name": "owner/repo"}
7575

76-
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
76+
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
7777
if template == "https://api.github.com/repos/owner/repo/pulls":
7878
if query_args["state"] == "open":
7979
return [
@@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi
117117
pulls_dir.mkdir()
118118
(pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
119119

120-
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
120+
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
121121
if template == "https://api.github.com/repos/owner/repo/pulls":
122122
if query_args["state"] == "open":
123123
return [
@@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data(
169169
f,
170170
)
171171

172-
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
172+
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
173173
if template == "https://api.github.com/repos/owner/repo/pulls":
174174
if query_args["state"] == "open":
175175
return [
@@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
213213
pulls_dir.mkdir()
214214
(pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
215215

216-
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
216+
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
217217
if template == "https://api.github.com/repos/owner/repo/pulls":
218218
if query_args["state"] == "open":
219219
return [

0 commit comments

Comments
 (0)