From 86ab4d1f3b19dddcdd2c811a67d2d70af2414419 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 19:14:54 +0200 Subject: [PATCH 1/6] Add bundle-ready UK release manifests --- .github/workflows/pull_request.yaml | 19 ++ .../add-bundle-release-manifest.changed.md | 1 + .../tests/test_release_manifest.py | 70 +++++++ policyengine_uk_data/utils/data_upload.py | 82 ++++++++- .../utils/release_manifest.py | 171 +++++++++++++++--- 5 files changed, 308 insertions(+), 35 deletions(-) create mode 100644 changelog.d/add-bundle-release-manifest.changed.md diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index 56d90e9f3..497e826ee 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -27,6 +27,25 @@ jobs: pip install "ruff>=0.9.0" - name: Check formatting run: ruff format --check . + bundle-release-manifest-contract: + name: Validate bundle release manifest contract + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: 3.13 + - name: Install uv + uses: astral-sh/setup-uv@v8.1.0 + - name: Install package + run: uv pip install --system . + - name: Install bundle validation tooling + # Pin the test-only bundle contract dependency until policyengine-bundles + # has published releases suitable for ordinary dependency specifiers. + run: uv pip install --system pytest "policyengine-bundles @ git+https://github.com/PolicyEngine/policyengine-bundles@8ae9f56fefcf89f69b8a7e3bc49928509c6207be" + - name: Validate release manifest contract + run: python -m pytest policyengine_uk_data/tests/test_release_manifest.py::test_build_release_manifest_validates_against_bundle_contract test: name: Test runs-on: ubuntu-latest diff --git a/changelog.d/add-bundle-release-manifest.changed.md b/changelog.d/add-bundle-release-manifest.changed.md new file mode 100644 index 000000000..cec2124bc --- /dev/null +++ b/changelog.d/add-bundle-release-manifest.changed.md @@ -0,0 +1 @@ +Added bundle-compatible metadata to UK data release manifests. diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index e42e95dc8..036679450 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -1,6 +1,7 @@ import hashlib from io import BytesIO from importlib import metadata +import json from pathlib import Path from unittest.mock import MagicMock, patch @@ -18,6 +19,11 @@ build_release_manifest, ) +EXPECTED_CORE_PACKAGE = {"name": "policyengine-core", "version": "3.26.0"} +EXPECTED_COMPATIBLE_CORE_PACKAGES = [ + {"name": "policyengine-core", "specifier": "==3.26.0"} +] + def _write_file(path: Path, content: bytes) -> Path: path.parent.mkdir(parents=True, exist_ok=True) @@ -52,6 +58,8 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): model_package_version="2.74.0", model_package_git_sha="deadbeef", model_package_data_build_fingerprint="sha256:fingerprint", + core_package_metadata=EXPECTED_CORE_PACKAGE, + data_package_git_sha="cafebabe", created_at="2026-04-10T12:00:00Z", ) @@ -66,15 +74,30 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): "specifier": "==2.74.0", } ] + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES assert manifest["build"] == { "build_id": "policyengine-uk-data-1.40.4", "built_at": "2026-04-10T12:00:00Z", + "metadata": { + "data_package_git_sha": "cafebabe", + }, "built_with_model_package": { "name": "policyengine-uk", "version": "2.74.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, }, + "built_with_core_package": EXPECTED_CORE_PACKAGE, + } + assert "created_at" not in manifest + assert manifest["metadata"] == { + "artifact_release": { + "repo_id": "policyengine/policyengine-uk-data-private", + "repo_type": "model", + "version": "1.40.4", + "visibility": "private", + } } assert manifest["default_datasets"] == { "national": "enhanced_frs_2023_24", @@ -86,6 +109,36 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): ) assert manifest["artifacts"]["frs_2023_24"]["sha256"] == _sha256(baseline_bytes) assert manifest["artifacts"]["local_authority_weights"]["kind"] == "weights" + assert manifest["artifacts"]["enhanced_frs_2023_24"]["uri"] == ( + "hf://model/policyengine/policyengine-uk-data-private@1.40.4/" + "enhanced_frs_2023_24.h5" + ) + assert manifest["artifacts"]["enhanced_frs_2023_24"]["metadata"] == { + "repo_type": "model", + "visibility": "private", + } + + +def test_build_release_manifest_validates_against_bundle_contract(tmp_path): + policyengine_bundles = pytest.importorskip("policyengine_bundles") + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + manifest = build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id="policyengine/policyengine-uk-data-private", + model_package_version="2.74.0", + model_package_git_sha="deadbeef", + model_package_data_build_fingerprint="sha256:fingerprint", + core_package_metadata=EXPECTED_CORE_PACKAGE, + data_package_git_sha="cafebabe", + created_at="2026-04-10T12:00:00Z", + ) + + policyengine_bundles.DataReleaseManifest.model_validate(manifest) def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_retry( @@ -113,6 +166,7 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re "specifier": "==1.0.0", } ], + "compatible_core_packages": [], "default_datasets": {}, "created_at": "2026-04-10T12:00:00Z", "artifacts": {}, @@ -192,8 +246,13 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): "version": "2.74.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, }, ), + patch( + "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", + return_value="cafebabe", + ), patch.dict( "policyengine_uk_data.utils.data_upload.os.environ", {"HUGGING_FACE_TOKEN": "token"}, @@ -221,3 +280,14 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): for operation in release_ops: assert isinstance(operation, CommitOperationAdd) assert isinstance(operation.path_or_fileobj, BytesIO) + + payload = release_ops[0].path_or_fileobj.getvalue() + manifest = json.loads(payload.decode("utf-8")) + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES + assert manifest["build"]["built_with_core_package"] == EXPECTED_CORE_PACKAGE + assert manifest["build"]["metadata"] == { + "data_package_git_sha": "cafebabe", + } + assert ( + manifest["build"]["built_with_model_package"]["core"] == EXPECTED_CORE_PACKAGE + ) diff --git a/policyengine_uk_data/utils/data_upload.py b/policyengine_uk_data/utils/data_upload.py index 03040d518..91737dc1c 100644 --- a/policyengine_uk_data/utils/data_upload.py +++ b/policyengine_uk_data/utils/data_upload.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple from huggingface_hub import HfApi, CommitOperationAdd, hf_hub_download from huggingface_hub.errors import EntryNotFoundError, RevisionNotFoundError from google.cloud import storage @@ -10,6 +10,7 @@ import json import logging import os +import subprocess import tomllib from policyengine_uk_data.utils.release_manifest import ( @@ -18,6 +19,7 @@ ) RELEASE_MANIFEST_PATH = "release_manifest.json" +REPO_ROOT = Path(__file__).resolve().parents[2] def _get_model_package_version( @@ -49,23 +51,27 @@ def _get_model_package_version( def _get_model_package_build_metadata( package_name: str = "policyengine-uk", -) -> Dict[str, Optional[str]]: - metadata_payload: Dict[str, Optional[str]] = { +) -> Dict[str, Any]: + metadata_payload: Dict[str, Any] = { "version": _get_model_package_version(package_name), "git_sha": None, "data_build_fingerprint": None, + "core": None, } module_name = package_name.replace("-", "_") try: build_metadata_module = __import__( f"{module_name}.build_metadata", - fromlist=["get_data_build_metadata"], + fromlist=["get_runtime_metadata", "get_data_build_metadata"], ) - get_data_build_metadata = getattr( - build_metadata_module, "get_data_build_metadata", None - ) - if callable(get_data_build_metadata): - package_metadata = get_data_build_metadata() + for metadata_getter_name in ( + "get_runtime_metadata", + "get_data_build_metadata", + ): + metadata_getter = getattr(build_metadata_module, metadata_getter_name, None) + if not callable(metadata_getter): + continue + package_metadata = metadata_getter() metadata_payload["version"] = ( package_metadata.get("version") or metadata_payload["version"] ) @@ -73,6 +79,8 @@ def _get_model_package_build_metadata( metadata_payload["data_build_fingerprint"] = package_metadata.get( "data_build_fingerprint" ) + metadata_payload["core"] = package_metadata.get("core") + break except Exception: logging.warning( "Could not load build metadata from %s while building release manifest.", @@ -82,6 +90,50 @@ def _get_model_package_build_metadata( return metadata_payload +def _get_core_package_runtime_metadata( + package_name: str = "policyengine-core", +) -> Dict[str, Any] | None: + try: + from policyengine_core import get_runtime_metadata + + return dict(get_runtime_metadata()) + except (AttributeError, ImportError): + pass + except Exception: + logging.warning( + "Could not load runtime metadata from %s while building release manifest.", + package_name, + exc_info=True, + ) + + try: + return { + "name": package_name, + "version": metadata.version(package_name), + } + except metadata.PackageNotFoundError: + logging.warning( + "Could not determine installed version for %s while building release manifest.", + package_name, + ) + return None + + +def _get_data_package_git_sha() -> str | None: + try: + return subprocess.check_output( + ["git", "-C", str(REPO_ROOT), "rev-parse", "HEAD"], + stderr=subprocess.DEVNULL, + text=True, + ).strip() + except Exception: + logging.warning( + "Could not determine policyengine-uk-data git SHA while building release manifest.", + exc_info=True, + ) + return None + + def load_release_manifest_from_hf( version: str, hf_repo_name: str = "policyengine/policyengine-uk-data-private", @@ -120,20 +172,26 @@ def create_release_manifest_commit_operations( files_with_repo_paths: List[Tuple[Path, str]], version: str, hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_type: str = "model", model_package_name: str = "policyengine-uk", model_package_version: Optional[str] = None, model_package_git_sha: Optional[str] = None, model_package_data_build_fingerprint: Optional[str] = None, + core_package_metadata: Optional[Mapping[str, Any]] = None, + data_package_git_sha: Optional[str] = None, existing_manifest: Optional[Dict] = None, ) -> Tuple[Dict, List[CommitOperationAdd]]: manifest = build_release_manifest( files_with_repo_paths=files_with_repo_paths, version=version, repo_id=hf_repo_name, + repo_type=hf_repo_type, model_package_name=model_package_name, model_package_version=model_package_version, model_package_git_sha=model_package_git_sha, model_package_data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + data_package_git_sha=data_package_git_sha, existing_manifest=existing_manifest, ) manifest_payload = serialize_release_manifest(manifest) @@ -209,15 +267,21 @@ def upload_files_to_hf( hf_repo_type=hf_repo_type, ) model_build_metadata = _get_model_package_build_metadata() + core_package_metadata = ( + model_build_metadata.get("core") or _get_core_package_runtime_metadata() + ) _, manifest_operations = create_release_manifest_commit_operations( files_with_repo_paths=files_with_repo_paths, version=version, hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, model_package_version=model_build_metadata["version"], model_package_git_sha=model_build_metadata["git_sha"], model_package_data_build_fingerprint=model_build_metadata[ "data_build_fingerprint" ], + core_package_metadata=core_package_metadata, + data_package_git_sha=_get_data_package_git_sha(), existing_manifest=existing_manifest, ) hf_operations.extend(manifest_operations) diff --git a/policyengine_uk_data/utils/release_manifest.py b/policyengine_uk_data/utils/release_manifest.py index f72ed944d..a140d627a 100644 --- a/policyengine_uk_data/utils/release_manifest.py +++ b/policyengine_uk_data/utils/release_manifest.py @@ -5,7 +5,7 @@ import hashlib import json from pathlib import Path, PurePosixPath -from typing import Dict, Mapping, Optional, Sequence, Tuple +from typing import Any, Dict, Mapping, Optional, Sequence, Tuple RELEASE_MANIFEST_SCHEMA_VERSION = 1 @@ -38,6 +38,66 @@ def _artifact_kind(path_in_repo: str) -> str: return "auxiliary" +def _artifact_uri( + *, + repo_id: str, + repo_type: str, + revision: str, + path_in_repo: str, +) -> str: + return f"hf://{repo_type}/{repo_id}@{revision}/{path_in_repo}" + + +def _artifact_visibility(repo_id: str) -> str: + return "private" if repo_id.endswith("-private") else "public" + + +def _without_none_values(payload: Mapping[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in payload.items() if value is not None} + + +def _runtime_component_metadata( + *, + name: str, + version: str | None, + git_sha: str | None = None, + data_build_fingerprint: str | None = None, + core_package_metadata: Mapping[str, Any] | None = None, +) -> Dict[str, Any] | None: + if version is None: + return None + + metadata = _without_none_values( + { + "name": name, + "version": version, + "git_sha": git_sha, + "data_build_fingerprint": data_build_fingerprint, + } + ) + if core_package_metadata is not None: + metadata["core"] = dict(core_package_metadata) + return metadata + + +def _build_metadata( + *, + data_package_git_sha: str | None, +) -> Dict[str, Any]: + return _without_none_values( + { + "data_package_git_sha": data_package_git_sha, + } + ) + + +def _core_version(core_package_metadata: Mapping[str, Any] | None) -> str | None: + if core_package_metadata is None: + return None + version = core_package_metadata.get("version") + return version if isinstance(version, str) and version else None + + def _base_manifest( *, version: str, @@ -46,9 +106,14 @@ def _base_manifest( model_package_version: str | None, model_package_git_sha: str | None, model_package_data_build_fingerprint: str | None, + core_package_metadata: Mapping[str, Any] | None, + data_package_git_sha: str | None, + repo_id: str, + repo_type: str, build_id: str, created_at: str, ) -> Dict: + build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) manifest = { "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, "data_package": { @@ -56,25 +121,35 @@ def _base_manifest( "version": version, }, "compatible_model_packages": [], + "compatible_core_packages": [], "default_datasets": {}, - "created_at": created_at, "build": { "build_id": build_id, "built_at": created_at, }, "artifacts": {}, + "metadata": { + "artifact_release": { + "repo_id": repo_id, + "repo_type": repo_type, + "version": version, + "visibility": _artifact_visibility(repo_id), + } + }, } - if ( - model_package_version - or model_package_git_sha - or model_package_data_build_fingerprint - ): - manifest["build"]["built_with_model_package"] = { - "name": model_package_name, - "version": model_package_version, - "git_sha": model_package_git_sha, - "data_build_fingerprint": model_package_data_build_fingerprint, - } + if build_metadata: + manifest["build"]["metadata"] = build_metadata + model_package_metadata = _runtime_component_metadata( + name=model_package_name, + version=model_package_version, + git_sha=model_package_git_sha, + data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + ) + if model_package_metadata is not None: + manifest["build"]["built_with_model_package"] = model_package_metadata + if core_package_metadata is not None: + manifest["build"]["built_with_core_package"] = dict(core_package_metadata) if model_package_version: manifest["compatible_model_packages"].append( { @@ -82,6 +157,14 @@ def _base_manifest( "specifier": f"=={model_package_version}", } ) + core_version = _core_version(core_package_metadata) + if core_version: + manifest["compatible_core_packages"].append( + { + "name": core_package_metadata.get("name", "policyengine-core"), + "specifier": f"=={core_version}", + } + ) return manifest @@ -96,7 +179,9 @@ def _normalize_existing_manifest( package = existing_manifest.get("data_package", {}) if package.get("name") != data_package_name or package.get("version") != version: return None - return deepcopy(dict(existing_manifest)) + manifest = deepcopy(dict(existing_manifest)) + manifest.pop("created_at", None) + return manifest def build_release_manifest( @@ -104,11 +189,14 @@ def build_release_manifest( files_with_repo_paths: Sequence[Tuple[Path | str, str]], version: str, repo_id: str, + repo_type: str = "model", data_package_name: str = "policyengine-uk-data", model_package_name: str = "policyengine-uk", model_package_version: str | None = None, model_package_git_sha: str | None = None, model_package_data_build_fingerprint: str | None = None, + core_package_metadata: Optional[Mapping[str, Any]] = None, + data_package_git_sha: str | None = None, build_id: str | None = None, existing_manifest: Mapping | None = None, default_datasets: Optional[Mapping[str, str]] = None, @@ -130,26 +218,39 @@ def build_release_manifest( model_package_version=model_package_version, model_package_git_sha=model_package_git_sha, model_package_data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + data_package_git_sha=data_package_git_sha, + repo_id=repo_id, + repo_type=repo_type, build_id=resolved_build_id, created_at=manifest_timestamp, ) else: manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION - manifest["created_at"] = manifest_timestamp + manifest.setdefault("compatible_core_packages", []) + manifest.setdefault("metadata", {})["artifact_release"] = { + "repo_id": repo_id, + "repo_type": repo_type, + "version": version, + "visibility": _artifact_visibility(repo_id), + } manifest.setdefault("build", {}) manifest["build"].setdefault("build_id", resolved_build_id) manifest["build"].setdefault("built_at", manifest_timestamp) - if ( - model_package_version - or model_package_git_sha - or model_package_data_build_fingerprint - ): - manifest["build"]["built_with_model_package"] = { - "name": model_package_name, - "version": model_package_version, - "git_sha": model_package_git_sha, - "data_build_fingerprint": model_package_data_build_fingerprint, - } + build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) + if build_metadata: + manifest["build"].setdefault("metadata", {}).update(build_metadata) + model_package_metadata = _runtime_component_metadata( + name=model_package_name, + version=model_package_version, + git_sha=model_package_git_sha, + data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + ) + if model_package_metadata is not None: + manifest["build"]["built_with_model_package"] = model_package_metadata + if core_package_metadata is not None: + manifest["build"]["built_with_core_package"] = dict(core_package_metadata) if model_package_version: manifest["compatible_model_packages"] = [ { @@ -157,6 +258,14 @@ def build_release_manifest( "specifier": f"=={model_package_version}", } ] + core_version = _core_version(core_package_metadata) + if core_version: + manifest["compatible_core_packages"] = [ + { + "name": core_package_metadata.get("name", "policyengine-core"), + "specifier": f"=={core_version}", + } + ] if default_datasets: manifest.setdefault("default_datasets", {}).update(default_datasets) @@ -165,11 +274,21 @@ def build_release_manifest( local_path = Path(local_path) manifest["artifacts"][_artifact_key(path_in_repo)] = { "kind": _artifact_kind(path_in_repo), + "uri": _artifact_uri( + repo_id=repo_id, + repo_type=repo_type, + revision=version, + path_in_repo=path_in_repo, + ), "path": path_in_repo, "repo_id": repo_id, "revision": version, "sha256": _compute_file_checksum(local_path), "size_bytes": local_path.stat().st_size, + "metadata": { + "repo_type": repo_type, + "visibility": _artifact_visibility(repo_id), + }, } defaults = manifest["default_datasets"] From d71ec2f2988154c654ce889076aa20b36b7e3365 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 19:47:15 +0200 Subject: [PATCH 2/6] Harden UK data release finalization --- .github/workflows/pull_request.yaml | 2 +- .../tests/test_release_manifest.py | 119 +++++++++++++++++- policyengine_uk_data/utils/data_upload.py | 106 +++++++++++++--- 3 files changed, 206 insertions(+), 21 deletions(-) diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index 497e826ee..158864a2e 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -35,7 +35,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: 3.13 + python-version: 3.14 - name: Install uv uses: astral-sh/setup-uv@v8.1.0 - name: Install package diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index 036679450..de9174977 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -7,7 +7,7 @@ import pytest from huggingface_hub import CommitOperationAdd -from huggingface_hub.errors import EntryNotFoundError +from huggingface_hub.errors import EntryNotFoundError, RevisionNotFoundError from policyengine_uk_data.utils.data_upload import ( _get_model_package_version, @@ -203,6 +203,40 @@ def test_load_release_manifest_from_hf_continues_on_missing_entry(tmp_path): assert manifest["data_package"]["version"] == "1.40.4" +def test_load_release_manifest_from_hf_uses_explicit_revision_when_requested(tmp_path): + manifest_path = tmp_path / "release_manifest.json" + manifest_path.write_text('{"data_package": {"version": "1.40.4"}}') + + with patch( + "policyengine_uk_data.utils.data_upload.hf_hub_download", + return_value=str(manifest_path), + ) as mock_download: + manifest = load_release_manifest_from_hf( + version="1.40.4", + revision="1.40.4", + ) + + assert manifest["data_package"]["version"] == "1.40.4" + assert mock_download.call_args.kwargs["revision"] == "1.40.4" + + +def test_load_release_manifest_from_hf_returns_none_when_revision_is_missing(): + with patch( + "policyengine_uk_data.utils.data_upload.hf_hub_download", + side_effect=RevisionNotFoundError( + "missing revision", + response=MagicMock(), + ), + ): + assert ( + load_release_manifest_from_hf( + version="1.40.4", + revision="1.40.4", + ) + is None + ) + + def test_get_model_package_version_prefers_imported_checkout(tmp_path): package_root = tmp_path / "policyengine_uk" package_root.mkdir() @@ -291,3 +325,86 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): assert ( manifest["build"]["built_with_model_package"]["core"] == EXPECTED_CORE_PACKAGE ) + + +def test_upload_files_to_hf_rejects_finalized_release(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + finalized_manifest = { + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [ + {"name": "policyengine-uk", "specifier": "==2.74.0"} + ], + "default_datasets": {"national": "enhanced_frs_2023_24"}, + "artifacts": { + "enhanced_frs_2023_24": { + "kind": "microdata", + "path": "enhanced_frs_2023_24.h5", + "repo_id": "policyengine/policyengine-uk-data-private", + "revision": "1.40.4", + "sha256": _sha256(b"enhanced-frs"), + "size_bytes": len(b"enhanced-frs"), + } + }, + } + mock_api = MagicMock() + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + side_effect=lambda *args, **kwargs: ( + finalized_manifest if kwargs.get("revision") == "1.40.4" else None + ), + ), + ): + with pytest.raises(RuntimeError, match="already finalized"): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) + + mock_api.create_commit.assert_not_called() + + +def test_upload_files_to_hf_rejects_tag_that_points_to_different_revision(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + mock_api = MagicMock() + mock_api.create_commit.return_value = MagicMock(oid="new-commit") + mock_api.create_tag.side_effect = RuntimeError("409 Tag reference exists already") + mock_api.repo_info.return_value = MagicMock(sha="old-commit") + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + return_value=None, + ), + patch( + "policyengine_uk_data.utils.data_upload._get_model_package_build_metadata", + return_value={ + "version": "2.74.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, + }, + ), + patch( + "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", + return_value="cafebabe", + ), + ): + with pytest.raises(RuntimeError, match="refusing to treat new-commit"): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) diff --git a/policyengine_uk_data/utils/data_upload.py b/policyengine_uk_data/utils/data_upload.py index 91737dc1c..93546a818 100644 --- a/policyengine_uk_data/utils/data_upload.py +++ b/policyengine_uk_data/utils/data_upload.py @@ -138,6 +138,7 @@ def load_release_manifest_from_hf( version: str, hf_repo_name: str = "policyengine/policyengine-uk-data-private", hf_repo_type: str = "model", + revision: Optional[str] = None, ) -> Optional[Dict]: token = os.environ.get("HUGGING_FACE_TOKEN") candidate_paths = [ @@ -152,8 +153,11 @@ def load_release_manifest_from_hf( filename=path_in_repo, repo_type=hf_repo_type, token=token, + revision=revision, ) except RevisionNotFoundError: + if revision is not None: + return None raise except EntryNotFoundError: continue @@ -168,6 +172,76 @@ def load_release_manifest_from_hf( return None +def assert_release_not_finalized( + version: str, + hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_type: str = "model", +) -> None: + finalized_manifest = load_release_manifest_from_hf( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + revision=version, + ) + if finalized_manifest is not None: + raise RuntimeError( + f"Release {version} is already finalized on {hf_repo_name}. " + "Refusing to mutate release manifest state after the tag exists." + ) + + +def create_release_tag( + version: str, + revision: str, + hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, +) -> None: + api = api or HfApi() + token = token or os.environ.get("HUGGING_FACE_TOKEN") + try: + api.create_tag( + token=token, + repo_id=hf_repo_name, + tag=version, + revision=revision, + repo_type=hf_repo_type, + exist_ok=False, + ) + logging.info( + "Tagged revision %s with %s in Hugging Face repository %s.", + revision, + version, + hf_repo_name, + ) + except Exception as e: + if "Tag reference exists already" in str(e) or "409" in str(e): + tagged_revision = getattr( + api.repo_info( + repo_id=hf_repo_name, + repo_type=hf_repo_type, + revision=version, + token=token, + ), + "sha", + None, + ) + if tagged_revision == revision: + logging.info( + "Tag %s already exists in %s and already points to %s.", + version, + hf_repo_name, + revision, + ) + return + raise RuntimeError( + f"Tag {version} already exists in {hf_repo_name} at " + f"{tagged_revision}; refusing to treat {revision} as finalized." + ) from e + raise + + def create_release_manifest_commit_operations( files_with_repo_paths: List[Tuple[Path, str]], version: str, @@ -245,6 +319,11 @@ def upload_files_to_hf( token = os.environ.get( "HUGGING_FACE_TOKEN", ) + assert_release_not_finalized( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + ) hf_operations = [] files_with_repo_paths = [] @@ -295,25 +374,14 @@ def upload_files_to_hf( ) logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.") - # Tag commit with version - try: - api.create_tag( - token=token, - repo_id=hf_repo_name, - tag=version, - revision=commit_info.oid, - repo_type=hf_repo_type, - ) - logging.info( - f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}." - ) - except Exception as e: - if "Tag reference exists already" in str(e) or "409" in str(e): - logging.warning( - f"Tag {version} already exists in {hf_repo_name}. Skipping tag creation." - ) - else: - raise + create_release_tag( + version=version, + revision=commit_info.oid, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) def upload_files_to_gcs( From 59e5fed8ade54c5338e900c97a93e71bb6ff5949 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 20:33:03 +0200 Subject: [PATCH 3/6] Clarify core metadata test fixture --- policyengine_uk_data/tests/test_release_manifest.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index de9174977..323e6142b 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -19,9 +19,14 @@ build_release_manifest, ) -EXPECTED_CORE_PACKAGE = {"name": "policyengine-core", "version": "3.26.0"} +# Synthetic fixture: this verifies manifest propagation, not the package dep range. +CORE_FIXTURE_VERSION = "9.8.7" +EXPECTED_CORE_PACKAGE = { + "name": "policyengine-core", + "version": CORE_FIXTURE_VERSION, +} EXPECTED_COMPATIBLE_CORE_PACKAGES = [ - {"name": "policyengine-core", "specifier": "==3.26.0"} + {"name": "policyengine-core", "specifier": f"=={CORE_FIXTURE_VERSION}"} ] From b2172c6b87607d555c87f3ef167c0a055bd22b6f Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 20:43:23 +0200 Subject: [PATCH 4/6] Test UK data draft manifest refresh --- .../tests/test_release_manifest.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index 323e6142b..c838e8cee 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -332,6 +332,80 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): ) +def test_upload_files_to_hf_refreshes_same_version_unfinalized_manifest(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs-v2", + ) + existing_manifest = { + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [ + { + "name": "policyengine-uk", + "specifier": "==2.0.0", + } + ], + "compatible_core_packages": [], + "default_datasets": {}, + "created_at": "2026-04-10T12:00:00Z", + "artifacts": {}, + } + mock_api = MagicMock() + mock_api.create_commit.return_value = MagicMock(oid="commit-sha") + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + side_effect=lambda *args, **kwargs: ( + None if kwargs.get("revision") == "1.40.4" else existing_manifest + ), + ), + patch( + "policyengine_uk_data.utils.data_upload._get_model_package_build_metadata", + return_value={ + "version": "2.74.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, + }, + ), + patch( + "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", + return_value="cafebabe", + ), + ): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) + + operations = mock_api.create_commit.call_args.kwargs["operations"] + release_op = next( + operation + for operation in operations + if operation.path_in_repo == "release_manifest.json" + ) + manifest = json.loads(release_op.path_or_fileobj.getvalue().decode("utf-8")) + + assert "created_at" not in manifest + assert manifest["compatible_model_packages"] == [ + {"name": "policyengine-uk", "specifier": "==2.74.0"} + ] + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES + assert manifest["build"]["metadata"] == { + "data_package_git_sha": "cafebabe", + } + assert manifest["build"]["built_with_core_package"] == EXPECTED_CORE_PACKAGE + assert manifest["artifacts"]["enhanced_frs_2023_24"]["sha256"] == _sha256( + b"enhanced-frs-v2" + ) + + def test_upload_files_to_hf_rejects_finalized_release(tmp_path): dataset_path = _write_file( tmp_path / "enhanced_frs_2023_24.h5", From c1d8f2da001028216a4038b1edd977d66791d3b8 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 22:59:59 +0200 Subject: [PATCH 5/6] Enforce UK data single release version --- .../tests/test_release_manifest.py | 63 +++++++++++++++++++ .../utils/release_manifest.py | 40 ++++++++---- 2 files changed, 91 insertions(+), 12 deletions(-) diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index c838e8cee..c80761cbd 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -40,6 +40,17 @@ def _sha256(content: bytes) -> str: return hashlib.sha256(content).hexdigest() +def _assert_single_uk_data_release_version(manifest: dict) -> None: + """UK data uses one version for package code, HF tags, and artifacts.""" + + release_version = manifest["data_package"]["version"] + assert manifest["metadata"]["artifact_release"]["version"] == release_version + + for artifact in manifest["artifacts"].values(): + assert artifact["revision"] == release_version + assert f"@{release_version}/" in artifact["uri"] + + def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): enhanced_bytes = b"enhanced-frs" baseline_bytes = b"baseline-frs" @@ -104,6 +115,7 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): "visibility": "private", } } + _assert_single_uk_data_release_version(manifest) assert manifest["default_datasets"] == { "national": "enhanced_frs_2023_24", "baseline": "frs_2023_24", @@ -181,6 +193,55 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re assert manifest["compatible_model_packages"] == [ {"name": "policyengine-uk", "specifier": "==9.99.9"} ] + _assert_single_uk_data_release_version(manifest) + + +def test_build_release_manifest_refreshes_draft_artifact_release_version(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + manifest = build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id="policyengine/policyengine-uk-data-private", + existing_manifest={ + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [], + "compatible_core_packages": [], + "default_datasets": {}, + "metadata": { + "artifact_release": { + "repo_id": "policyengine/policyengine-uk-data-private", + "repo_type": "model", + "version": "stale-draft-version", + "visibility": "private", + } + }, + "artifacts": { + "enhanced_frs_2023_24": { + "kind": "microdata", + "uri": "hf://model/policyengine/policyengine-uk-data-private@stale-draft-version/enhanced_frs_2023_24.h5", + "path": "enhanced_frs_2023_24.h5", + "repo_id": "policyengine/policyengine-uk-data-private", + "revision": "stale-draft-version", + "sha256": "stale", + "size_bytes": 5, + "metadata": { + "repo_type": "model", + "visibility": "private", + }, + } + }, + }, + ) + + _assert_single_uk_data_release_version(manifest) def test_load_release_manifest_from_hf_raises_non_missing_download_errors(): @@ -322,6 +383,7 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): payload = release_ops[0].path_or_fileobj.getvalue() manifest = json.loads(payload.decode("utf-8")) + _assert_single_uk_data_release_version(manifest) assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES assert manifest["build"]["built_with_core_package"] == EXPECTED_CORE_PACKAGE assert manifest["build"]["metadata"] == { @@ -393,6 +455,7 @@ def test_upload_files_to_hf_refreshes_same_version_unfinalized_manifest(tmp_path manifest = json.loads(release_op.path_or_fileobj.getvalue().decode("utf-8")) assert "created_at" not in manifest + _assert_single_uk_data_release_version(manifest) assert manifest["compatible_model_packages"] == [ {"name": "policyengine-uk", "specifier": "==2.74.0"} ] diff --git a/policyengine_uk_data/utils/release_manifest.py b/policyengine_uk_data/utils/release_manifest.py index a140d627a..dab241407 100644 --- a/policyengine_uk_data/utils/release_manifest.py +++ b/policyengine_uk_data/utils/release_manifest.py @@ -52,6 +52,22 @@ def _artifact_visibility(repo_id: str) -> str: return "private" if repo_id.endswith("-private") else "public" +def _artifact_release_metadata( + *, + repo_id: str, + repo_type: str, + version: str, +) -> Dict[str, str]: + # UK data uses one release coordinate across package code, HF tags, and + # published dataset artifacts. Do not treat this as a separate artifact version. + return { + "repo_id": repo_id, + "repo_type": repo_type, + "version": version, + "visibility": _artifact_visibility(repo_id), + } + + def _without_none_values(payload: Mapping[str, Any]) -> Dict[str, Any]: return {key: value for key, value in payload.items() if value is not None} @@ -129,12 +145,11 @@ def _base_manifest( }, "artifacts": {}, "metadata": { - "artifact_release": { - "repo_id": repo_id, - "repo_type": repo_type, - "version": version, - "visibility": _artifact_visibility(repo_id), - } + "artifact_release": _artifact_release_metadata( + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) }, } if build_metadata: @@ -228,12 +243,13 @@ def build_release_manifest( else: manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION manifest.setdefault("compatible_core_packages", []) - manifest.setdefault("metadata", {})["artifact_release"] = { - "repo_id": repo_id, - "repo_type": repo_type, - "version": version, - "visibility": _artifact_visibility(repo_id), - } + manifest.setdefault("metadata", {})["artifact_release"] = ( + _artifact_release_metadata( + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + ) manifest.setdefault("build", {}) manifest["build"].setdefault("build_id", resolved_build_id) manifest["build"].setdefault("built_at", manifest_timestamp) From 8204433c1c70872f9d2399644b583524fb87fd77 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 5 May 2026 23:23:44 +0200 Subject: [PATCH 6/6] Harden UK data release manifest uploads --- .../tests/test_release_manifest.py | 76 +++-- policyengine_uk_data/utils/data_upload.py | 67 +++- .../utils/release_manifest.py | 314 +++++++++++------- 3 files changed, 283 insertions(+), 174 deletions(-) diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index c80761cbd..6c1f459dc 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -14,6 +14,7 @@ load_release_manifest_from_hf, upload_files_to_hf, ) +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO from policyengine_uk_data.utils.release_manifest import ( RELEASE_MANIFEST_SCHEMA_VERSION, build_release_manifest, @@ -30,6 +31,13 @@ ] +def _missing_revision_error() -> RevisionNotFoundError: + return RevisionNotFoundError( + "missing revision", + response=MagicMock(), + ) + + def _write_file(path: Path, content: bytes) -> Path: path.parent.mkdir(parents=True, exist_ok=True) path.write_bytes(content) @@ -70,7 +78,7 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): (weights_path, "local_authority_weights.h5"), ], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, model_package_version="2.74.0", model_package_git_sha="deadbeef", model_package_data_build_fingerprint="sha256:fingerprint", @@ -109,7 +117,7 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): assert "created_at" not in manifest assert manifest["metadata"] == { "artifact_release": { - "repo_id": "policyengine/policyengine-uk-data-private", + "repo_id": PRIVATE_REPO, "repo_type": "model", "version": "1.40.4", "visibility": "private", @@ -127,8 +135,7 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): assert manifest["artifacts"]["frs_2023_24"]["sha256"] == _sha256(baseline_bytes) assert manifest["artifacts"]["local_authority_weights"]["kind"] == "weights" assert manifest["artifacts"]["enhanced_frs_2023_24"]["uri"] == ( - "hf://model/policyengine/policyengine-uk-data-private@1.40.4/" - "enhanced_frs_2023_24.h5" + f"hf://model/{PRIVATE_REPO}@1.40.4/enhanced_frs_2023_24.h5" ) assert manifest["artifacts"]["enhanced_frs_2023_24"]["metadata"] == { "repo_type": "model", @@ -146,7 +153,7 @@ def test_build_release_manifest_validates_against_bundle_contract(tmp_path): manifest = build_release_manifest( files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, model_package_version="2.74.0", model_package_git_sha="deadbeef", model_package_data_build_fingerprint="sha256:fingerprint", @@ -158,6 +165,20 @@ def test_build_release_manifest_validates_against_bundle_contract(tmp_path): policyengine_bundles.DataReleaseManifest.model_validate(manifest) +def test_build_release_manifest_rejects_unknown_hf_repo(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + with pytest.raises(ValueError, match="Unknown UK data Hugging Face repo"): + build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id="policyengine/policyengine-uk-data-private-copy", + ) + + def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_retry( tmp_path, ): @@ -169,7 +190,7 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re manifest = build_release_manifest( files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, model_package_version="9.99.9", existing_manifest={ "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, @@ -205,7 +226,7 @@ def test_build_release_manifest_refreshes_draft_artifact_release_version(tmp_pat manifest = build_release_manifest( files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, existing_manifest={ "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, "data_package": { @@ -217,7 +238,7 @@ def test_build_release_manifest_refreshes_draft_artifact_release_version(tmp_pat "default_datasets": {}, "metadata": { "artifact_release": { - "repo_id": "policyengine/policyengine-uk-data-private", + "repo_id": PRIVATE_REPO, "repo_type": "model", "version": "stale-draft-version", "visibility": "private", @@ -226,9 +247,9 @@ def test_build_release_manifest_refreshes_draft_artifact_release_version(tmp_pat "artifacts": { "enhanced_frs_2023_24": { "kind": "microdata", - "uri": "hf://model/policyengine/policyengine-uk-data-private@stale-draft-version/enhanced_frs_2023_24.h5", + "uri": f"hf://model/{PRIVATE_REPO}@stale-draft-version/enhanced_frs_2023_24.h5", "path": "enhanced_frs_2023_24.h5", - "repo_id": "policyengine/policyengine-uk-data-private", + "repo_id": PRIVATE_REPO, "revision": "stale-draft-version", "sha256": "stale", "size_bytes": 5, @@ -289,10 +310,7 @@ def test_load_release_manifest_from_hf_uses_explicit_revision_when_requested(tmp def test_load_release_manifest_from_hf_returns_none_when_revision_is_missing(): with patch( "policyengine_uk_data.utils.data_upload.hf_hub_download", - side_effect=RevisionNotFoundError( - "missing revision", - response=MagicMock(), - ), + side_effect=_missing_revision_error(), ): assert ( load_release_manifest_from_hf( @@ -333,6 +351,7 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): mock_api = MagicMock() mock_api.create_commit.return_value = MagicMock(oid="commit-sha") + mock_api.repo_info.side_effect = _missing_revision_error() with ( patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), @@ -418,6 +437,7 @@ def test_upload_files_to_hf_refreshes_same_version_unfinalized_manifest(tmp_path } mock_api = MagicMock() mock_api.create_commit.return_value = MagicMock(oid="commit-sha") + mock_api.repo_info.side_effect = _missing_revision_error() with ( patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), @@ -488,7 +508,7 @@ def test_upload_files_to_hf_rejects_finalized_release(tmp_path): "enhanced_frs_2023_24": { "kind": "microdata", "path": "enhanced_frs_2023_24.h5", - "repo_id": "policyengine/policyengine-uk-data-private", + "repo_id": PRIVATE_REPO, "revision": "1.40.4", "sha256": _sha256(b"enhanced-frs"), "size_bytes": len(b"enhanced-frs"), @@ -496,6 +516,7 @@ def test_upload_files_to_hf_rejects_finalized_release(tmp_path): }, } mock_api = MagicMock() + mock_api.repo_info.side_effect = _missing_revision_error() with ( patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), @@ -515,14 +536,12 @@ def test_upload_files_to_hf_rejects_finalized_release(tmp_path): mock_api.create_commit.assert_not_called() -def test_upload_files_to_hf_rejects_tag_that_points_to_different_revision(tmp_path): +def test_upload_files_to_hf_rejects_existing_tag_before_commit(tmp_path): dataset_path = _write_file( tmp_path / "enhanced_frs_2023_24.h5", b"enhanced-frs", ) mock_api = MagicMock() - mock_api.create_commit.return_value = MagicMock(oid="new-commit") - mock_api.create_tag.side_effect = RuntimeError("409 Tag reference exists already") mock_api.repo_info.return_value = MagicMock(sha="old-commit") with ( @@ -530,23 +549,14 @@ def test_upload_files_to_hf_rejects_tag_that_points_to_different_revision(tmp_pa patch( "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", return_value=None, - ), - patch( - "policyengine_uk_data.utils.data_upload._get_model_package_build_metadata", - return_value={ - "version": "2.74.0", - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - "core": EXPECTED_CORE_PACKAGE, - }, - ), - patch( - "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", - return_value="cafebabe", - ), + ) as mock_load_release_manifest, ): - with pytest.raises(RuntimeError, match="refusing to treat new-commit"): + with pytest.raises(RuntimeError, match="already finalized"): upload_files_to_hf( files=[dataset_path], version="1.40.4", ) + + mock_load_release_manifest.assert_not_called() + mock_api.create_commit.assert_not_called() + mock_api.create_tag.assert_not_called() diff --git a/policyengine_uk_data/utils/data_upload.py b/policyengine_uk_data/utils/data_upload.py index 93546a818..8c54f7c97 100644 --- a/policyengine_uk_data/utils/data_upload.py +++ b/policyengine_uk_data/utils/data_upload.py @@ -17,6 +17,7 @@ build_release_manifest, serialize_release_manifest, ) +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO, PUBLIC_REPO RELEASE_MANIFEST_PATH = "release_manifest.json" REPO_ROOT = Path(__file__).resolve().parents[2] @@ -136,7 +137,7 @@ def _get_data_package_git_sha() -> str | None: def load_release_manifest_from_hf( version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", revision: Optional[str] = None, ) -> Optional[Dict]: @@ -172,11 +173,48 @@ def load_release_manifest_from_hf( return None +def _get_release_tag_revision( + version: str, + hf_repo_name: str = PRIVATE_REPO, + hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, +) -> Optional[str]: + api = api or HfApi() + token = token or os.environ.get("HUGGING_FACE_TOKEN") + try: + repo_info = api.repo_info( + repo_id=hf_repo_name, + repo_type=hf_repo_type, + revision=version, + token=token, + ) + return getattr(repo_info, "sha", None) or "" + except RevisionNotFoundError: + return None + + def assert_release_not_finalized( version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, ) -> None: + tagged_revision = _get_release_tag_revision( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) + if tagged_revision is not None: + raise RuntimeError( + f"Release {version} is already finalized on {hf_repo_name} at " + f"{tagged_revision}. Refusing to mutate release manifest state " + "after the tag exists." + ) + finalized_manifest = load_release_manifest_from_hf( version=version, hf_repo_name=hf_repo_name, @@ -193,7 +231,7 @@ def assert_release_not_finalized( def create_release_tag( version: str, revision: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", token: Optional[str] = None, api: Optional[HfApi] = None, @@ -217,15 +255,12 @@ def create_release_tag( ) except Exception as e: if "Tag reference exists already" in str(e) or "409" in str(e): - tagged_revision = getattr( - api.repo_info( - repo_id=hf_repo_name, - repo_type=hf_repo_type, - revision=version, - token=token, - ), - "sha", - None, + tagged_revision = _get_release_tag_revision( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, ) if tagged_revision == revision: logging.info( @@ -245,7 +280,7 @@ def create_release_tag( def create_release_manifest_commit_operations( files_with_repo_paths: List[Tuple[Path, str]], version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", model_package_name: str = "policyengine-uk", model_package_version: Optional[str] = None, @@ -285,7 +320,7 @@ def create_release_manifest_commit_operations( def upload_data_files( files: List[str], gcs_bucket_name: str = "policyengine-uk-data-private", - hf_repo_name: str = "policyengine/policyengine-uk-data", + hf_repo_name: str = PUBLIC_REPO, hf_repo_type: str = "model", version: str = None, ): @@ -309,7 +344,7 @@ def upload_data_files( def upload_files_to_hf( files: List[str], version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", ): """ @@ -323,6 +358,8 @@ def upload_files_to_hf( version=version, hf_repo_name=hf_repo_name, hf_repo_type=hf_repo_type, + token=token, + api=api, ) hf_operations = [] files_with_repo_paths = [] diff --git a/policyengine_uk_data/utils/release_manifest.py b/policyengine_uk_data/utils/release_manifest.py index dab241407..e9eb90b09 100644 --- a/policyengine_uk_data/utils/release_manifest.py +++ b/policyengine_uk_data/utils/release_manifest.py @@ -7,6 +7,8 @@ from pathlib import Path, PurePosixPath from typing import Any, Dict, Mapping, Optional, Sequence, Tuple +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO, PUBLIC_REPO + RELEASE_MANIFEST_SCHEMA_VERSION = 1 @@ -49,7 +51,14 @@ def _artifact_uri( def _artifact_visibility(repo_id: str) -> str: - return "private" if repo_id.endswith("-private") else "public" + if repo_id == PRIVATE_REPO: + return "private" + if repo_id == PUBLIC_REPO: + return "public" + raise ValueError( + f"Unknown UK data Hugging Face repo {repo_id!r}; use " + "PRIVATE_REPO or PUBLIC_REPO." + ) def _artifact_release_metadata( @@ -114,23 +123,42 @@ def _core_version(core_package_metadata: Mapping[str, Any] | None) -> str | None return version if isinstance(version, str) and version else None -def _base_manifest( +def _model_package_compatibility( *, - version: str, - data_package_name: str, model_package_name: str, model_package_version: str | None, - model_package_git_sha: str | None, - model_package_data_build_fingerprint: str | None, +) -> list[Dict[str, str]]: + if not model_package_version: + return [] + return [ + { + "name": model_package_name, + "specifier": f"=={model_package_version}", + } + ] + + +def _core_package_compatibility( + *, core_package_metadata: Mapping[str, Any] | None, - data_package_git_sha: str | None, - repo_id: str, - repo_type: str, - build_id: str, - created_at: str, +) -> list[Dict[str, str]]: + core_version = _core_version(core_package_metadata) + if not core_version or core_package_metadata is None: + return [] + return [ + { + "name": core_package_metadata.get("name", "policyengine-core"), + "specifier": f"=={core_version}", + } + ] + + +def _new_release_manifest( + *, + version: str, + data_package_name: str, ) -> Dict: - build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) - manifest = { + return { "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, "data_package": { "name": data_package_name, @@ -139,21 +167,49 @@ def _base_manifest( "compatible_model_packages": [], "compatible_core_packages": [], "default_datasets": {}, - "build": { - "build_id": build_id, - "built_at": created_at, - }, + "build": {}, "artifacts": {}, - "metadata": { - "artifact_release": _artifact_release_metadata( - repo_id=repo_id, - repo_type=repo_type, - version=version, - ) - }, + "metadata": {}, } + + +def _update_manifest_metadata( + manifest: Dict, + *, + repo_id: str, + repo_type: str, + version: str, +) -> None: + manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION + manifest.setdefault("metadata", {})["artifact_release"] = ( + _artifact_release_metadata( + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + ) + + +def _update_build_section( + manifest: Dict, + *, + build_id: str, + created_at: str, + data_package_git_sha: str | None, + model_package_name: str, + model_package_version: str | None, + model_package_git_sha: str | None, + model_package_data_build_fingerprint: str | None, + core_package_metadata: Mapping[str, Any] | None, +) -> None: + build = manifest.setdefault("build", {}) + build.setdefault("build_id", build_id) + build.setdefault("built_at", created_at) + + build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) if build_metadata: - manifest["build"]["metadata"] = build_metadata + build.setdefault("metadata", {}).update(build_metadata) + model_package_metadata = _runtime_component_metadata( name=model_package_name, version=model_package_version, @@ -162,25 +218,79 @@ def _base_manifest( core_package_metadata=core_package_metadata, ) if model_package_metadata is not None: - manifest["build"]["built_with_model_package"] = model_package_metadata + build["built_with_model_package"] = model_package_metadata if core_package_metadata is not None: - manifest["build"]["built_with_core_package"] = dict(core_package_metadata) - if model_package_version: - manifest["compatible_model_packages"].append( - { - "name": model_package_name, - "specifier": f"=={model_package_version}", - } - ) - core_version = _core_version(core_package_metadata) - if core_version: - manifest["compatible_core_packages"].append( - { - "name": core_package_metadata.get("name", "policyengine-core"), - "specifier": f"=={core_version}", - } - ) - return manifest + build["built_with_core_package"] = dict(core_package_metadata) + + +def _update_compatibility( + manifest: Dict, + *, + model_package_name: str, + model_package_version: str | None, + core_package_metadata: Mapping[str, Any] | None, +) -> None: + manifest.setdefault("compatible_model_packages", []) + model_package_compatibility = _model_package_compatibility( + model_package_name=model_package_name, + model_package_version=model_package_version, + ) + if model_package_compatibility: + manifest["compatible_model_packages"] = model_package_compatibility + + manifest.setdefault("compatible_core_packages", []) + core_package_compatibility = _core_package_compatibility( + core_package_metadata=core_package_metadata, + ) + if core_package_compatibility: + manifest["compatible_core_packages"] = core_package_compatibility + + +def _update_artifacts( + manifest: Dict, + *, + files_with_repo_paths: Sequence[Tuple[Path | str, str]], + repo_id: str, + repo_type: str, + version: str, +) -> None: + artifacts = manifest.setdefault("artifacts", {}) + for local_path, path_in_repo in files_with_repo_paths: + local_path = Path(local_path) + artifacts[_artifact_key(path_in_repo)] = { + "kind": _artifact_kind(path_in_repo), + "uri": _artifact_uri( + repo_id=repo_id, + repo_type=repo_type, + revision=version, + path_in_repo=path_in_repo, + ), + "path": path_in_repo, + "repo_id": repo_id, + "revision": version, + "sha256": _compute_file_checksum(local_path), + "size_bytes": local_path.stat().st_size, + "metadata": { + "repo_type": repo_type, + "visibility": _artifact_visibility(repo_id), + }, + } + + +def _update_default_datasets( + manifest: Dict, + *, + default_datasets: Optional[Mapping[str, str]], +) -> None: + defaults = manifest.setdefault("default_datasets", {}) + if default_datasets: + defaults.update(default_datasets) + if "national" not in defaults and "enhanced_frs_2023_24" in manifest.get( + "artifacts", {} + ): + defaults["national"] = "enhanced_frs_2023_24" + if "baseline" not in defaults and "frs_2023_24" in manifest.get("artifacts", {}): + defaults["baseline"] = "frs_2023_24" def _normalize_existing_manifest( @@ -226,93 +336,45 @@ def build_release_manifest( resolved_build_id = build_id or f"{data_package_name}-{version}" if manifest is None: - manifest = _base_manifest( + manifest = _new_release_manifest( version=version, data_package_name=data_package_name, - model_package_name=model_package_name, - model_package_version=model_package_version, - model_package_git_sha=model_package_git_sha, - model_package_data_build_fingerprint=model_package_data_build_fingerprint, - core_package_metadata=core_package_metadata, - data_package_git_sha=data_package_git_sha, - repo_id=repo_id, - repo_type=repo_type, - build_id=resolved_build_id, - created_at=manifest_timestamp, - ) - else: - manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION - manifest.setdefault("compatible_core_packages", []) - manifest.setdefault("metadata", {})["artifact_release"] = ( - _artifact_release_metadata( - repo_id=repo_id, - repo_type=repo_type, - version=version, - ) ) - manifest.setdefault("build", {}) - manifest["build"].setdefault("build_id", resolved_build_id) - manifest["build"].setdefault("built_at", manifest_timestamp) - build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) - if build_metadata: - manifest["build"].setdefault("metadata", {}).update(build_metadata) - model_package_metadata = _runtime_component_metadata( - name=model_package_name, - version=model_package_version, - git_sha=model_package_git_sha, - data_build_fingerprint=model_package_data_build_fingerprint, - core_package_metadata=core_package_metadata, - ) - if model_package_metadata is not None: - manifest["build"]["built_with_model_package"] = model_package_metadata - if core_package_metadata is not None: - manifest["build"]["built_with_core_package"] = dict(core_package_metadata) - if model_package_version: - manifest["compatible_model_packages"] = [ - { - "name": model_package_name, - "specifier": f"=={model_package_version}", - } - ] - core_version = _core_version(core_package_metadata) - if core_version: - manifest["compatible_core_packages"] = [ - { - "name": core_package_metadata.get("name", "policyengine-core"), - "specifier": f"=={core_version}", - } - ] - - if default_datasets: - manifest.setdefault("default_datasets", {}).update(default_datasets) - - for local_path, path_in_repo in files_with_repo_paths: - local_path = Path(local_path) - manifest["artifacts"][_artifact_key(path_in_repo)] = { - "kind": _artifact_kind(path_in_repo), - "uri": _artifact_uri( - repo_id=repo_id, - repo_type=repo_type, - revision=version, - path_in_repo=path_in_repo, - ), - "path": path_in_repo, - "repo_id": repo_id, - "revision": version, - "sha256": _compute_file_checksum(local_path), - "size_bytes": local_path.stat().st_size, - "metadata": { - "repo_type": repo_type, - "visibility": _artifact_visibility(repo_id), - }, - } - - defaults = manifest["default_datasets"] - if "national" not in defaults and "enhanced_frs_2023_24" in manifest["artifacts"]: - defaults["national"] = "enhanced_frs_2023_24" - if "baseline" not in defaults and "frs_2023_24" in manifest["artifacts"]: - defaults["baseline"] = "frs_2023_24" + _update_manifest_metadata( + manifest, + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + _update_build_section( + manifest, + build_id=resolved_build_id, + created_at=manifest_timestamp, + data_package_git_sha=data_package_git_sha, + model_package_name=model_package_name, + model_package_version=model_package_version, + model_package_git_sha=model_package_git_sha, + model_package_data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + ) + _update_compatibility( + manifest, + model_package_name=model_package_name, + model_package_version=model_package_version, + core_package_metadata=core_package_metadata, + ) + _update_artifacts( + manifest, + files_with_repo_paths=files_with_repo_paths, + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + _update_default_datasets( + manifest, + default_datasets=default_datasets, + ) return manifest