6 changes: 6 additions & 0 deletions CHANGES.rst
@@ -12,6 +12,12 @@ Unreleased
   backups use the legacy global checkpoint as a migration fallback, and the
   legacy file is removed once existing issue/pull backups have resource
   checkpoints (#62).
+- Stop paginating pull requests during incremental backups once the sorted
+  results are at or older than the active checkpoint.
+- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly
+  matches the active incremental checkpoint.
+- Avoid extra release asset list requests by using asset metadata already
+  included in GitHub's releases response.
 - Add ``--token-from-gh`` to read authentication from ``gh auth token``.


37 changes: 24 additions & 13 deletions github_backup/github_backup.py
@@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers):
     return delay + random.uniform(0, delay * 0.1)
 
 
-def retrieve_data(args, template, query_args=None, paginated=True):
+def retrieve_data(args, template, query_args=None, paginated=True, lazy=False):
     """
     Fetch the data from GitHub API.
 
-    Handle both single requests and pagination with yield of individual dicts.
+    Handle both single requests and pagination. Returns a list by default, or
+    a generator when lazy=True so callers can stop before fetching every page.
     Handles throttling, retries, read errors, and DMCA takedowns.
     """
     query_args = query_args or {}
@@ -851,6 +852,9 @@ def _extract_legal_url(response_body_bytes):
             ):
                 break  # No more data
 
+    if lazy:
+        return fetch_all()
+
     return list(fetch_all())
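The list-by-default, generator-when-lazy split is the heart of this change: the body of `fetch_all()` does not run until the caller starts iterating, so a caller that breaks early never triggers the remaining page requests. A minimal sketch of the pattern under those assumptions (toy code, not the project's API; the page loop stands in for the real request/retry logic):

```python
def retrieve_items(lazy=False):
    """Toy model of retrieve_data: list by default, generator when lazy=True."""
    def fetch_all():
        for page in (1, 2, 3):
            print(f"fetching page {page}")  # stands in for one HTTP request
            yield from [f"item-{page}-{i}" for i in range(2)]

    if lazy:
        return fetch_all()    # generator: no page is fetched until iteration begins
    return list(fetch_all())  # list: every page is fetched up front

for item in retrieve_items(lazy=True):
    break  # prints only "fetching page 1"; pages 2 and 3 are never requested
```

Returning `fetch_all()` unexhausted keeps the existing eager call sites unchanged while letting incremental callers short-circuit pagination.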


@@ -2229,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None):
             if updated_at and (newest_seen is None or updated_at > newest_seen):
                 newest_seen = updated_at
 
-            if since and updated_at and updated_at < since:
+            if since and updated_at and updated_at <= since:
                 stop = True
                 break
@@ -2650,26 +2654,28 @@ def track_newest_pull_update(pull):
             newest_pull_update = updated_at
 
     def pull_is_due_for_repository_checkpoint(pull):
-        return not repository_since or pull["updated_at"] >= repository_since
+        return not repository_since or pull["updated_at"] > repository_since
 
     if not args.include_pull_details:
         pull_states = ["open", "closed"]
         for pull_state in pull_states:
             query_args["state"] = pull_state
-            _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
-            for pull in _pulls:
+            for pull in retrieve_data(
+                args, _pulls_template, query_args=query_args, lazy=True
+            ):
                 track_newest_pull_update(pull)
-                if pulls_since and pull["updated_at"] < pulls_since:
+                if pulls_since and pull["updated_at"] <= pulls_since:
                     break
-                if not pulls_since or pull["updated_at"] >= pulls_since:
+                if not pulls_since or pull["updated_at"] > pulls_since:
                     pulls[pull["number"]] = pull
     else:
-        _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
-        for pull in _pulls:
+        for pull in retrieve_data(
+            args, _pulls_template, query_args=query_args, lazy=True
+        ):
             track_newest_pull_update(pull)
-            if pulls_since and pull["updated_at"] < pulls_since:
+            if pulls_since and pull["updated_at"] <= pulls_since:
                 break
-            if not pulls_since or pull["updated_at"] >= pulls_since:
+            if not pulls_since or pull["updated_at"] > pulls_since:
                 if pull_is_due_for_repository_checkpoint(pull):
                     pulls[pull["number"]] = retrieve_data(
                         args,
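Because the pulls listing is consumed newest-first by update time, the first entry at or older than the checkpoint guarantees that every later entry is older too, so breaking out of the lazy iterator also stops pagination. A hedged sketch of that invariant (`take_newer_than` is an illustrative helper, not the project's API):

```python
def take_newer_than(pulls_sorted_desc, checkpoint):
    """Yield pulls strictly newer than the checkpoint from a newest-first stream.

    Once one entry is at or older than the checkpoint, all remaining entries
    are older as well, so stopping here (combined with a lazy paginator)
    skips every later page request.
    """
    for pull in pulls_sorted_desc:
        if checkpoint and pull["updated_at"] <= checkpoint:
            return
        yield pull

stream = iter([
    {"number": 2, "updated_at": "2026-04-26T09:00:00Z"},
    {"number": 1, "updated_at": "2026-04-26T08:13:46Z"},  # equals the checkpoint
    {"number": 0, "updated_at": "2026-04-25T07:00:00Z"},
])
kept = list(take_newer_than(stream, "2026-04-26T08:13:46Z"))
assert [p["number"] for p in kept] == [2]
```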
@@ -2913,7 +2919,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
         written_count += 1
 
         if include_assets and not skip_assets:
-            assets = retrieve_data(args, release["assets_url"])
+            # The releases list API already includes release asset metadata. Use
+            # it to avoid an extra /releases/{id}/assets request per release.
+            # Keep a fallback for older/enterprise responses that might omit it.
+            assets = release.get("assets")
+            if assets is None:
+                assets = retrieve_data(args, release["assets_url"])
             if len(assets) > 0:
                 # give release asset files somewhere to live & download them (not including source archives)
                 release_assets_cwd = os.path.join(release_cwd, release_name_safe)
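The fallback shape above, prefer the embedded `assets` list and only call the assets endpoint when the key is missing, can be isolated like this (`fetch_assets` is a hypothetical stand-in for the extra `retrieve_data` call, not the project's API):

```python
def assets_for(release, fetch_assets):
    """Use asset metadata embedded in the releases response when present.

    fetch_assets is a hypothetical stand-in for the per-release
    /releases/{id}/assets request; it only fires when an older or
    GitHub Enterprise response omits the "assets" key entirely.
    """
    assets = release.get("assets")
    if assets is None:  # distinguish "key missing" from an empty asset list
        assets = fetch_assets(release["assets_url"])
    return assets

release = {
    "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets",
    "assets": [{"name": "tool-1.0.tar.gz"}],
}
# The embedded metadata is used; the fallback callback is never invoked.
assert assets_for(release, fetch_assets=lambda url: []) == [{"name": "tool-1.0.tar.gz"}]
```

Note the `is None` check: an empty `assets` list is a valid answer (a release with no assets) and must not trigger the extra request.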
35 changes: 35 additions & 0 deletions tests/test_discussions.py
@@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args):
     )
 
 
+def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args):
+    args = create_args()
+    repository = {"full_name": "owner/repo"}
+
+    page = {
+        "repository": {
+            "hasDiscussionsEnabled": True,
+            "discussions": {
+                "totalCount": 1,
+                "nodes": [
+                    {
+                        "number": 1,
+                        "title": "already backed up",
+                        "updatedAt": "2026-01-01T00:00:00Z",
+                    },
+                ],
+                "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"},
+            },
+        }
+    }
+
+    with patch(
+        "github_backup.github_backup.retrieve_graphql_data", return_value=page
+    ) as mock_retrieve:
+        summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
+            args, repository, since="2026-01-01T00:00:00Z"
+        )
+
+    assert enabled is True
+    assert total == 1
+    assert newest == "2026-01-01T00:00:00Z"
+    assert summaries == []
+    assert mock_retrieve.call_count == 1
+
+
 def test_retrieve_discussion_summaries_disabled_discussions(create_args):
     args = create_args()
     repository = {"full_name": "owner/repo"}
131 changes: 131 additions & 0 deletions tests/test_pull_incremental_pagination.py
@@ -0,0 +1,131 @@
"""Tests for incremental pull request pagination."""

import json
import os
from unittest.mock import patch

from github_backup import github_backup


class MockHTTPResponse:
def __init__(self, data, link_header=None):
self._content = json.dumps(data).encode("utf-8")
self._link_header = link_header
self._read = False
self.reason = "OK"

def getcode(self):
return 200

def read(self):
if self._read:
return b""
self._read = True
return self._content

@property
def headers(self):
headers = {"x-ratelimit-remaining": "5000"}
if self._link_header:
headers["Link"] = self._link_header
return headers


def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path):
args = create_args(include_pulls=True, incremental=True)
args.since = "2026-04-26T08:13:46Z"
repository = {"full_name": "owner/repo"}

responses = [
MockHTTPResponse([]),
MockHTTPResponse(
[
{
"number": 1,
"title": "already backed up",
"updated_at": "2026-04-26T08:13:46Z",
},
],
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
),
MockHTTPResponse(
[
{
"number": 0,
"title": "older pull on page 2",
"updated_at": "2026-04-25T07:00:00Z",
}
]
),
]
requests_made = []

def mock_urlopen(request, *args, **kwargs):
requests_made.append(request.get_full_url())
return responses[len(requests_made) - 1]

with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
github_backup.backup_pulls(
args, tmp_path, repository, "https://api.github.com/repos"
)

assert len(requests_made) == 2
assert "state=open" in requests_made[0]
assert "state=closed" in requests_made[1]
assert all("page=2" not in url for url in requests_made)
assert not os.path.exists(tmp_path / "pulls" / "1.json")
assert not os.path.exists(tmp_path / "pulls" / "0.json")


def test_backup_pulls_incremental_stops_before_fetching_old_pages(
create_args, tmp_path
):
args = create_args(include_pulls=True, incremental=True)
args.since = "2026-04-26T08:13:46Z"
repository = {"full_name": "owner/repo"}

responses = [
MockHTTPResponse([]),
MockHTTPResponse(
[
{
"number": 2,
"title": "new pull",
"updated_at": "2026-04-26T09:00:00Z",
},
{
"number": 1,
"title": "old pull",
"updated_at": "2026-04-26T07:00:00Z",
},
],
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
),
MockHTTPResponse(
[
{
"number": 0,
"title": "older pull on page 2",
"updated_at": "2026-04-25T07:00:00Z",
}
]
),
]
requests_made = []

def mock_urlopen(request, *args, **kwargs):
requests_made.append(request.get_full_url())
return responses[len(requests_made) - 1]

with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
github_backup.backup_pulls(
args, tmp_path, repository, "https://api.github.com/repos"
)

assert len(requests_made) == 2
assert "state=open" in requests_made[0]
assert "state=closed" in requests_made[1]
assert all("page=2" not in url for url in requests_made)
assert os.path.exists(tmp_path / "pulls" / "2.json")
assert not os.path.exists(tmp_path / "pulls" / "1.json")
assert not os.path.exists(tmp_path / "pulls" / "0.json")
10 changes: 5 additions & 5 deletions tests/test_pull_reviews.py
@@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
     repository = {"full_name": "owner/repo"}
     calls = []
 
-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         calls.append((template, query_args))
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
@@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint(
     args.since = "2026-01-01T00:00:00Z"
     repository = {"full_name": "owner/repo"}
 
-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi
     pulls_dir.mkdir()
     (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
 
-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data(
             f,
         )
 
-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
     pulls_dir.mkdir()
     (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
 
-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [