diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index 56d90e9f..158864a2 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -27,6 +27,25 @@ jobs: pip install "ruff>=0.9.0" - name: Check formatting run: ruff format --check . + bundle-release-manifest-contract: + name: Validate bundle release manifest contract + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: 3.14 + - name: Install uv + uses: astral-sh/setup-uv@v8.1.0 + - name: Install package + run: uv pip install --system . + - name: Install bundle validation tooling + # Pin the test-only bundle contract dependency until policyengine-bundles + # has published releases suitable for ordinary dependency specifiers. + run: uv pip install --system pytest "policyengine-bundles @ git+https://github.com/PolicyEngine/policyengine-bundles@8ae9f56fefcf89f69b8a7e3bc49928509c6207be" + - name: Validate release manifest contract + run: python -m pytest policyengine_uk_data/tests/test_release_manifest.py::test_build_release_manifest_validates_against_bundle_contract test: name: Test runs-on: ubuntu-latest diff --git a/changelog.d/add-bundle-release-manifest.changed.md b/changelog.d/add-bundle-release-manifest.changed.md new file mode 100644 index 00000000..cec2124b --- /dev/null +++ b/changelog.d/add-bundle-release-manifest.changed.md @@ -0,0 +1 @@ +Added bundle-compatible metadata to UK data release manifests. diff --git a/policyengine_uk_data/tests/test_release_manifest.py b/policyengine_uk_data/tests/test_release_manifest.py index e42e95dc..6c1f459d 100644 --- a/policyengine_uk_data/tests/test_release_manifest.py +++ b/policyengine_uk_data/tests/test_release_manifest.py @@ -1,23 +1,42 @@ import hashlib from io import BytesIO from importlib import metadata +import json from pathlib import Path from unittest.mock import MagicMock, patch import pytest from huggingface_hub import CommitOperationAdd -from huggingface_hub.errors import EntryNotFoundError +from huggingface_hub.errors import EntryNotFoundError, RevisionNotFoundError from policyengine_uk_data.utils.data_upload import ( _get_model_package_version, load_release_manifest_from_hf, upload_files_to_hf, ) +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO from policyengine_uk_data.utils.release_manifest import ( RELEASE_MANIFEST_SCHEMA_VERSION, build_release_manifest, ) +# Synthetic fixture: this verifies manifest propagation, not the package dep range. +CORE_FIXTURE_VERSION = "9.8.7" +EXPECTED_CORE_PACKAGE = { + "name": "policyengine-core", + "version": CORE_FIXTURE_VERSION, +} +EXPECTED_COMPATIBLE_CORE_PACKAGES = [ + {"name": "policyengine-core", "specifier": f"=={CORE_FIXTURE_VERSION}"} +] + + +def _missing_revision_error() -> RevisionNotFoundError: + return RevisionNotFoundError( + "missing revision", + response=MagicMock(), + ) + def _write_file(path: Path, content: bytes) -> Path: path.parent.mkdir(parents=True, exist_ok=True) @@ -29,6 +48,17 @@ def _sha256(content: bytes) -> str: return hashlib.sha256(content).hexdigest() +def _assert_single_uk_data_release_version(manifest: dict) -> None: + """UK data uses one version for package code, HF tags, and artifacts.""" + + release_version = manifest["data_package"]["version"] + assert manifest["metadata"]["artifact_release"]["version"] == release_version + + for artifact in manifest["artifacts"].values(): + assert artifact["revision"] == release_version + assert f"@{release_version}/" in artifact["uri"] + + def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): enhanced_bytes = b"enhanced-frs" baseline_bytes = b"baseline-frs" @@ -48,10 +78,12 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): (weights_path, "local_authority_weights.h5"), ], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, model_package_version="2.74.0", model_package_git_sha="deadbeef", model_package_data_build_fingerprint="sha256:fingerprint", + core_package_metadata=EXPECTED_CORE_PACKAGE, + data_package_git_sha="cafebabe", created_at="2026-04-10T12:00:00Z", ) @@ -66,16 +98,32 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): "specifier": "==2.74.0", } ] + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES assert manifest["build"] == { "build_id": "policyengine-uk-data-1.40.4", "built_at": "2026-04-10T12:00:00Z", + "metadata": { + "data_package_git_sha": "cafebabe", + }, "built_with_model_package": { "name": "policyengine-uk", "version": "2.74.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, }, + "built_with_core_package": EXPECTED_CORE_PACKAGE, } + assert "created_at" not in manifest + assert manifest["metadata"] == { + "artifact_release": { + "repo_id": PRIVATE_REPO, + "repo_type": "model", + "version": "1.40.4", + "visibility": "private", + } + } + _assert_single_uk_data_release_version(manifest) assert manifest["default_datasets"] == { "national": "enhanced_frs_2023_24", "baseline": "frs_2023_24", @@ -86,6 +134,49 @@ def test_build_release_manifest_tracks_uk_release_artifacts(tmp_path): ) assert manifest["artifacts"]["frs_2023_24"]["sha256"] == _sha256(baseline_bytes) assert manifest["artifacts"]["local_authority_weights"]["kind"] == "weights" + assert manifest["artifacts"]["enhanced_frs_2023_24"]["uri"] == ( + f"hf://model/{PRIVATE_REPO}@1.40.4/enhanced_frs_2023_24.h5" + ) + assert manifest["artifacts"]["enhanced_frs_2023_24"]["metadata"] == { + "repo_type": "model", + "visibility": "private", + } + + +def test_build_release_manifest_validates_against_bundle_contract(tmp_path): + policyengine_bundles = pytest.importorskip("policyengine_bundles") + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + manifest = build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id=PRIVATE_REPO, + model_package_version="2.74.0", + model_package_git_sha="deadbeef", + model_package_data_build_fingerprint="sha256:fingerprint", + core_package_metadata=EXPECTED_CORE_PACKAGE, + data_package_git_sha="cafebabe", + created_at="2026-04-10T12:00:00Z", + ) + + policyengine_bundles.DataReleaseManifest.model_validate(manifest) + + +def test_build_release_manifest_rejects_unknown_hf_repo(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + with pytest.raises(ValueError, match="Unknown UK data Hugging Face repo"): + build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id="policyengine/policyengine-uk-data-private-copy", + ) def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_retry( @@ -99,7 +190,7 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re manifest = build_release_manifest( files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], version="1.40.4", - repo_id="policyengine/policyengine-uk-data-private", + repo_id=PRIVATE_REPO, model_package_version="9.99.9", existing_manifest={ "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, @@ -113,6 +204,7 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re "specifier": "==1.0.0", } ], + "compatible_core_packages": [], "default_datasets": {}, "created_at": "2026-04-10T12:00:00Z", "artifacts": {}, @@ -122,6 +214,55 @@ def test_build_release_manifest_refreshes_compatible_model_packages_for_draft_re assert manifest["compatible_model_packages"] == [ {"name": "policyengine-uk", "specifier": "==9.99.9"} ] + _assert_single_uk_data_release_version(manifest) + + +def test_build_release_manifest_refreshes_draft_artifact_release_version(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + + manifest = build_release_manifest( + files_with_repo_paths=[(dataset_path, "enhanced_frs_2023_24.h5")], + version="1.40.4", + repo_id=PRIVATE_REPO, + existing_manifest={ + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [], + "compatible_core_packages": [], + "default_datasets": {}, + "metadata": { + "artifact_release": { + "repo_id": PRIVATE_REPO, + "repo_type": "model", + "version": "stale-draft-version", + "visibility": "private", + } + }, + "artifacts": { + "enhanced_frs_2023_24": { + "kind": "microdata", + "uri": f"hf://model/{PRIVATE_REPO}@stale-draft-version/enhanced_frs_2023_24.h5", + "path": "enhanced_frs_2023_24.h5", + "repo_id": PRIVATE_REPO, + "revision": "stale-draft-version", + "sha256": "stale", + "size_bytes": 5, + "metadata": { + "repo_type": "model", + "visibility": "private", + }, + } + }, + }, + ) + + _assert_single_uk_data_release_version(manifest) def test_load_release_manifest_from_hf_raises_non_missing_download_errors(): @@ -149,6 +290,37 @@ def test_load_release_manifest_from_hf_continues_on_missing_entry(tmp_path): assert manifest["data_package"]["version"] == "1.40.4" +def test_load_release_manifest_from_hf_uses_explicit_revision_when_requested(tmp_path): + manifest_path = tmp_path / "release_manifest.json" + manifest_path.write_text('{"data_package": {"version": "1.40.4"}}') + + with patch( + "policyengine_uk_data.utils.data_upload.hf_hub_download", + return_value=str(manifest_path), + ) as mock_download: + manifest = load_release_manifest_from_hf( + version="1.40.4", + revision="1.40.4", + ) + + assert manifest["data_package"]["version"] == "1.40.4" + assert mock_download.call_args.kwargs["revision"] == "1.40.4" + + +def test_load_release_manifest_from_hf_returns_none_when_revision_is_missing(): + with patch( + "policyengine_uk_data.utils.data_upload.hf_hub_download", + side_effect=_missing_revision_error(), + ): + assert ( + load_release_manifest_from_hf( + version="1.40.4", + revision="1.40.4", + ) + is None + ) + + def test_get_model_package_version_prefers_imported_checkout(tmp_path): package_root = tmp_path / "policyengine_uk" package_root.mkdir() @@ -179,6 +351,7 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): mock_api = MagicMock() mock_api.create_commit.return_value = MagicMock(oid="commit-sha") + mock_api.repo_info.side_effect = _missing_revision_error() with ( patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), @@ -192,8 +365,13 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): "version": "2.74.0", "git_sha": "deadbeef", "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, }, ), + patch( + "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", + return_value="cafebabe", + ), patch.dict( "policyengine_uk_data.utils.data_upload.os.environ", {"HUGGING_FACE_TOKEN": "token"}, @@ -221,3 +399,164 @@ def test_upload_files_to_hf_adds_uk_release_manifest_operations(tmp_path): for operation in release_ops: assert isinstance(operation, CommitOperationAdd) assert isinstance(operation.path_or_fileobj, BytesIO) + + payload = release_ops[0].path_or_fileobj.getvalue() + manifest = json.loads(payload.decode("utf-8")) + _assert_single_uk_data_release_version(manifest) + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES + assert manifest["build"]["built_with_core_package"] == EXPECTED_CORE_PACKAGE + assert manifest["build"]["metadata"] == { + "data_package_git_sha": "cafebabe", + } + assert ( + manifest["build"]["built_with_model_package"]["core"] == EXPECTED_CORE_PACKAGE + ) + + +def test_upload_files_to_hf_refreshes_same_version_unfinalized_manifest(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs-v2", + ) + existing_manifest = { + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [ + { + "name": "policyengine-uk", + "specifier": "==2.0.0", + } + ], + "compatible_core_packages": [], + "default_datasets": {}, + "created_at": "2026-04-10T12:00:00Z", + "artifacts": {}, + } + mock_api = MagicMock() + mock_api.create_commit.return_value = MagicMock(oid="commit-sha") + mock_api.repo_info.side_effect = _missing_revision_error() + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + side_effect=lambda *args, **kwargs: ( + None if kwargs.get("revision") == "1.40.4" else existing_manifest + ), + ), + patch( + "policyengine_uk_data.utils.data_upload._get_model_package_build_metadata", + return_value={ + "version": "2.74.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:fingerprint", + "core": EXPECTED_CORE_PACKAGE, + }, + ), + patch( + "policyengine_uk_data.utils.data_upload._get_data_package_git_sha", + return_value="cafebabe", + ), + ): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) + + operations = mock_api.create_commit.call_args.kwargs["operations"] + release_op = next( + operation + for operation in operations + if operation.path_in_repo == "release_manifest.json" + ) + manifest = json.loads(release_op.path_or_fileobj.getvalue().decode("utf-8")) + + assert "created_at" not in manifest + _assert_single_uk_data_release_version(manifest) + assert manifest["compatible_model_packages"] == [ + {"name": "policyengine-uk", "specifier": "==2.74.0"} + ] + assert manifest["compatible_core_packages"] == EXPECTED_COMPATIBLE_CORE_PACKAGES + assert manifest["build"]["metadata"] == { + "data_package_git_sha": "cafebabe", + } + assert manifest["build"]["built_with_core_package"] == EXPECTED_CORE_PACKAGE + assert manifest["artifacts"]["enhanced_frs_2023_24"]["sha256"] == _sha256( + b"enhanced-frs-v2" + ) + + +def test_upload_files_to_hf_rejects_finalized_release(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + finalized_manifest = { + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + }, + "compatible_model_packages": [ + {"name": "policyengine-uk", "specifier": "==2.74.0"} + ], + "default_datasets": {"national": "enhanced_frs_2023_24"}, + "artifacts": { + "enhanced_frs_2023_24": { + "kind": "microdata", + "path": "enhanced_frs_2023_24.h5", + "repo_id": PRIVATE_REPO, + "revision": "1.40.4", + "sha256": _sha256(b"enhanced-frs"), + "size_bytes": len(b"enhanced-frs"), + } + }, + } + mock_api = MagicMock() + mock_api.repo_info.side_effect = _missing_revision_error() + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + side_effect=lambda *args, **kwargs: ( + finalized_manifest if kwargs.get("revision") == "1.40.4" else None + ), + ), + ): + with pytest.raises(RuntimeError, match="already finalized"): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) + + mock_api.create_commit.assert_not_called() + + +def test_upload_files_to_hf_rejects_existing_tag_before_commit(tmp_path): + dataset_path = _write_file( + tmp_path / "enhanced_frs_2023_24.h5", + b"enhanced-frs", + ) + mock_api = MagicMock() + mock_api.repo_info.return_value = MagicMock(sha="old-commit") + + with ( + patch("policyengine_uk_data.utils.data_upload.HfApi", return_value=mock_api), + patch( + "policyengine_uk_data.utils.data_upload.load_release_manifest_from_hf", + return_value=None, + ) as mock_load_release_manifest, + ): + with pytest.raises(RuntimeError, match="already finalized"): + upload_files_to_hf( + files=[dataset_path], + version="1.40.4", + ) + + mock_load_release_manifest.assert_not_called() + mock_api.create_commit.assert_not_called() + mock_api.create_tag.assert_not_called() diff --git a/policyengine_uk_data/utils/data_upload.py b/policyengine_uk_data/utils/data_upload.py index 03040d51..8c54f7c9 100644 --- a/policyengine_uk_data/utils/data_upload.py +++ b/policyengine_uk_data/utils/data_upload.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple from huggingface_hub import HfApi, CommitOperationAdd, hf_hub_download from huggingface_hub.errors import EntryNotFoundError, RevisionNotFoundError from google.cloud import storage @@ -10,14 +10,17 @@ import json import logging import os +import subprocess import tomllib from policyengine_uk_data.utils.release_manifest import ( build_release_manifest, serialize_release_manifest, ) +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO, PUBLIC_REPO RELEASE_MANIFEST_PATH = "release_manifest.json" +REPO_ROOT = Path(__file__).resolve().parents[2] def _get_model_package_version( @@ -49,23 +52,27 @@ def _get_model_package_version( def _get_model_package_build_metadata( package_name: str = "policyengine-uk", -) -> Dict[str, Optional[str]]: - metadata_payload: Dict[str, Optional[str]] = { +) -> Dict[str, Any]: + metadata_payload: Dict[str, Any] = { "version": _get_model_package_version(package_name), "git_sha": None, "data_build_fingerprint": None, + "core": None, } module_name = package_name.replace("-", "_") try: build_metadata_module = __import__( f"{module_name}.build_metadata", - fromlist=["get_data_build_metadata"], + fromlist=["get_runtime_metadata", "get_data_build_metadata"], ) - get_data_build_metadata = getattr( - build_metadata_module, "get_data_build_metadata", None - ) - if callable(get_data_build_metadata): - package_metadata = get_data_build_metadata() + for metadata_getter_name in ( + "get_runtime_metadata", + "get_data_build_metadata", + ): + metadata_getter = getattr(build_metadata_module, metadata_getter_name, None) + if not callable(metadata_getter): + continue + package_metadata = metadata_getter() metadata_payload["version"] = ( package_metadata.get("version") or metadata_payload["version"] ) @@ -73,6 +80,8 @@ def _get_model_package_build_metadata( metadata_payload["data_build_fingerprint"] = package_metadata.get( "data_build_fingerprint" ) + metadata_payload["core"] = package_metadata.get("core") + break except Exception: logging.warning( "Could not load build metadata from %s while building release manifest.", @@ -82,10 +91,55 @@ def _get_model_package_build_metadata( return metadata_payload +def _get_core_package_runtime_metadata( + package_name: str = "policyengine-core", +) -> Dict[str, Any] | None: + try: + from policyengine_core import get_runtime_metadata + + return dict(get_runtime_metadata()) + except (AttributeError, ImportError): + pass + except Exception: + logging.warning( + "Could not load runtime metadata from %s while building release manifest.", + package_name, + exc_info=True, + ) + + try: + return { + "name": package_name, + "version": metadata.version(package_name), + } + except metadata.PackageNotFoundError: + logging.warning( + "Could not determine installed version for %s while building release manifest.", + package_name, + ) + return None + + +def _get_data_package_git_sha() -> str | None: + try: + return subprocess.check_output( + ["git", "-C", str(REPO_ROOT), "rev-parse", "HEAD"], + stderr=subprocess.DEVNULL, + text=True, + ).strip() + except Exception: + logging.warning( + "Could not determine policyengine-uk-data git SHA while building release manifest.", + exc_info=True, + ) + return None + + def load_release_manifest_from_hf( version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", + revision: Optional[str] = None, ) -> Optional[Dict]: token = os.environ.get("HUGGING_FACE_TOKEN") candidate_paths = [ @@ -100,8 +154,11 @@ def load_release_manifest_from_hf( filename=path_in_repo, repo_type=hf_repo_type, token=token, + revision=revision, ) except RevisionNotFoundError: + if revision is not None: + return None raise except EntryNotFoundError: continue @@ -116,24 +173,134 @@ def load_release_manifest_from_hf( return None +def _get_release_tag_revision( + version: str, + hf_repo_name: str = PRIVATE_REPO, + hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, +) -> Optional[str]: + api = api or HfApi() + token = token or os.environ.get("HUGGING_FACE_TOKEN") + try: + repo_info = api.repo_info( + repo_id=hf_repo_name, + repo_type=hf_repo_type, + revision=version, + token=token, + ) + return getattr(repo_info, "sha", None) or "" + except RevisionNotFoundError: + return None + + +def assert_release_not_finalized( + version: str, + hf_repo_name: str = PRIVATE_REPO, + hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, +) -> None: + tagged_revision = _get_release_tag_revision( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) + if tagged_revision is not None: + raise RuntimeError( + f"Release {version} is already finalized on {hf_repo_name} at " + f"{tagged_revision}. Refusing to mutate release manifest state " + "after the tag exists." + ) + + finalized_manifest = load_release_manifest_from_hf( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + revision=version, + ) + if finalized_manifest is not None: + raise RuntimeError( + f"Release {version} is already finalized on {hf_repo_name}. " + "Refusing to mutate release manifest state after the tag exists." + ) + + +def create_release_tag( + version: str, + revision: str, + hf_repo_name: str = PRIVATE_REPO, + hf_repo_type: str = "model", + token: Optional[str] = None, + api: Optional[HfApi] = None, +) -> None: + api = api or HfApi() + token = token or os.environ.get("HUGGING_FACE_TOKEN") + try: + api.create_tag( + token=token, + repo_id=hf_repo_name, + tag=version, + revision=revision, + repo_type=hf_repo_type, + exist_ok=False, + ) + logging.info( + "Tagged revision %s with %s in Hugging Face repository %s.", + revision, + version, + hf_repo_name, + ) + except Exception as e: + if "Tag reference exists already" in str(e) or "409" in str(e): + tagged_revision = _get_release_tag_revision( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) + if tagged_revision == revision: + logging.info( + "Tag %s already exists in %s and already points to %s.", + version, + hf_repo_name, + revision, + ) + return + raise RuntimeError( + f"Tag {version} already exists in {hf_repo_name} at " + f"{tagged_revision}; refusing to treat {revision} as finalized." + ) from e + raise + + def create_release_manifest_commit_operations( files_with_repo_paths: List[Tuple[Path, str]], version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, + hf_repo_type: str = "model", model_package_name: str = "policyengine-uk", model_package_version: Optional[str] = None, model_package_git_sha: Optional[str] = None, model_package_data_build_fingerprint: Optional[str] = None, + core_package_metadata: Optional[Mapping[str, Any]] = None, + data_package_git_sha: Optional[str] = None, existing_manifest: Optional[Dict] = None, ) -> Tuple[Dict, List[CommitOperationAdd]]: manifest = build_release_manifest( files_with_repo_paths=files_with_repo_paths, version=version, repo_id=hf_repo_name, + repo_type=hf_repo_type, model_package_name=model_package_name, model_package_version=model_package_version, model_package_git_sha=model_package_git_sha, model_package_data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + data_package_git_sha=data_package_git_sha, existing_manifest=existing_manifest, ) manifest_payload = serialize_release_manifest(manifest) @@ -153,7 +320,7 @@ def create_release_manifest_commit_operations( def upload_data_files( files: List[str], gcs_bucket_name: str = "policyengine-uk-data-private", - hf_repo_name: str = "policyengine/policyengine-uk-data", + hf_repo_name: str = PUBLIC_REPO, hf_repo_type: str = "model", version: str = None, ): @@ -177,7 +344,7 @@ def upload_data_files( def upload_files_to_hf( files: List[str], version: str, - hf_repo_name: str = "policyengine/policyengine-uk-data-private", + hf_repo_name: str = PRIVATE_REPO, hf_repo_type: str = "model", ): """ @@ -187,6 +354,13 @@ def upload_files_to_hf( token = os.environ.get( "HUGGING_FACE_TOKEN", ) + assert_release_not_finalized( + version=version, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) hf_operations = [] files_with_repo_paths = [] @@ -209,15 +383,21 @@ def upload_files_to_hf( hf_repo_type=hf_repo_type, ) model_build_metadata = _get_model_package_build_metadata() + core_package_metadata = ( + model_build_metadata.get("core") or _get_core_package_runtime_metadata() + ) _, manifest_operations = create_release_manifest_commit_operations( files_with_repo_paths=files_with_repo_paths, version=version, hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, model_package_version=model_build_metadata["version"], model_package_git_sha=model_build_metadata["git_sha"], model_package_data_build_fingerprint=model_build_metadata[ "data_build_fingerprint" ], + core_package_metadata=core_package_metadata, + data_package_git_sha=_get_data_package_git_sha(), existing_manifest=existing_manifest, ) hf_operations.extend(manifest_operations) @@ -231,25 +411,14 @@ def upload_files_to_hf( ) logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.") - # Tag commit with version - try: - api.create_tag( - token=token, - repo_id=hf_repo_name, - tag=version, - revision=commit_info.oid, - repo_type=hf_repo_type, - ) - logging.info( - f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}." - ) - except Exception as e: - if "Tag reference exists already" in str(e) or "409" in str(e): - logging.warning( - f"Tag {version} already exists in {hf_repo_name}. Skipping tag creation." - ) - else: - raise + create_release_tag( + version=version, + revision=commit_info.oid, + hf_repo_name=hf_repo_name, + hf_repo_type=hf_repo_type, + token=token, + api=api, + ) def upload_files_to_gcs( diff --git a/policyengine_uk_data/utils/release_manifest.py b/policyengine_uk_data/utils/release_manifest.py index f72ed944..e9eb90b0 100644 --- a/policyengine_uk_data/utils/release_manifest.py +++ b/policyengine_uk_data/utils/release_manifest.py @@ -5,7 +5,9 @@ import hashlib import json from pathlib import Path, PurePosixPath -from typing import Dict, Mapping, Optional, Sequence, Tuple +from typing import Any, Dict, Mapping, Optional, Sequence, Tuple + +from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO, PUBLIC_REPO RELEASE_MANIFEST_SCHEMA_VERSION = 1 @@ -38,51 +40,257 @@ def _artifact_kind(path_in_repo: str) -> str: return "auxiliary" -def _base_manifest( +def _artifact_uri( + *, + repo_id: str, + repo_type: str, + revision: str, + path_in_repo: str, +) -> str: + return f"hf://{repo_type}/{repo_id}@{revision}/{path_in_repo}" + + +def _artifact_visibility(repo_id: str) -> str: + if repo_id == PRIVATE_REPO: + return "private" + if repo_id == PUBLIC_REPO: + return "public" + raise ValueError( + f"Unknown UK data Hugging Face repo {repo_id!r}; use " + "PRIVATE_REPO or PUBLIC_REPO." + ) + + +def _artifact_release_metadata( *, + repo_id: str, + repo_type: str, version: str, - data_package_name: str, +) -> Dict[str, str]: + # UK data uses one release coordinate across package code, HF tags, and + # published dataset artifacts. Do not treat this as a separate artifact version. + return { + "repo_id": repo_id, + "repo_type": repo_type, + "version": version, + "visibility": _artifact_visibility(repo_id), + } + + +def _without_none_values(payload: Mapping[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in payload.items() if value is not None} + + +def _runtime_component_metadata( + *, + name: str, + version: str | None, + git_sha: str | None = None, + data_build_fingerprint: str | None = None, + core_package_metadata: Mapping[str, Any] | None = None, +) -> Dict[str, Any] | None: + if version is None: + return None + + metadata = _without_none_values( + { + "name": name, + "version": version, + "git_sha": git_sha, + "data_build_fingerprint": data_build_fingerprint, + } + ) + if core_package_metadata is not None: + metadata["core"] = dict(core_package_metadata) + return metadata + + +def _build_metadata( + *, + data_package_git_sha: str | None, +) -> Dict[str, Any]: + return _without_none_values( + { + "data_package_git_sha": data_package_git_sha, + } + ) + + +def _core_version(core_package_metadata: Mapping[str, Any] | None) -> str | None: + if core_package_metadata is None: + return None + version = core_package_metadata.get("version") + return version if isinstance(version, str) and version else None + + +def _model_package_compatibility( + *, model_package_name: str, model_package_version: str | None, - model_package_git_sha: str | None, - model_package_data_build_fingerprint: str | None, - build_id: str, - created_at: str, +) -> list[Dict[str, str]]: + if not model_package_version: + return [] + return [ + { + "name": model_package_name, + "specifier": f"=={model_package_version}", + } + ] + + +def _core_package_compatibility( + *, + core_package_metadata: Mapping[str, Any] | None, +) -> list[Dict[str, str]]: + core_version = _core_version(core_package_metadata) + if not core_version or core_package_metadata is None: + return [] + return [ + { + "name": core_package_metadata.get("name", "policyengine-core"), + "specifier": f"=={core_version}", + } + ] + + +def _new_release_manifest( + *, + version: str, + data_package_name: str, ) -> Dict: - manifest = { + return { "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, "data_package": { "name": data_package_name, "version": version, }, "compatible_model_packages": [], + "compatible_core_packages": [], "default_datasets": {}, - "created_at": created_at, - "build": { - "build_id": build_id, - "built_at": created_at, - }, + "build": {}, "artifacts": {}, + "metadata": {}, } - if ( - model_package_version - or model_package_git_sha - or model_package_data_build_fingerprint - ): - manifest["build"]["built_with_model_package"] = { - "name": model_package_name, - "version": model_package_version, - "git_sha": model_package_git_sha, - "data_build_fingerprint": model_package_data_build_fingerprint, - } - if model_package_version: - manifest["compatible_model_packages"].append( - { - "name": model_package_name, - "specifier": f"=={model_package_version}", - } + + +def _update_manifest_metadata( + manifest: Dict, + *, + repo_id: str, + repo_type: str, + version: str, +) -> None: + manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION + manifest.setdefault("metadata", {})["artifact_release"] = ( + _artifact_release_metadata( + repo_id=repo_id, + repo_type=repo_type, + version=version, ) - return manifest + ) + + +def _update_build_section( + manifest: Dict, + *, + build_id: str, + created_at: str, + data_package_git_sha: str | None, + model_package_name: str, + model_package_version: str | None, + model_package_git_sha: str | None, + model_package_data_build_fingerprint: str | None, + core_package_metadata: Mapping[str, Any] | None, +) -> None: + build = manifest.setdefault("build", {}) + build.setdefault("build_id", build_id) + build.setdefault("built_at", created_at) + + build_metadata = _build_metadata(data_package_git_sha=data_package_git_sha) + if build_metadata: + build.setdefault("metadata", {}).update(build_metadata) + + model_package_metadata = _runtime_component_metadata( + name=model_package_name, + version=model_package_version, + git_sha=model_package_git_sha, + data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + ) + if model_package_metadata is not None: + build["built_with_model_package"] = model_package_metadata + if core_package_metadata is not None: + build["built_with_core_package"] = dict(core_package_metadata) + + +def _update_compatibility( + manifest: Dict, + *, + model_package_name: str, + model_package_version: str | None, + core_package_metadata: Mapping[str, Any] | None, +) -> None: + manifest.setdefault("compatible_model_packages", []) + model_package_compatibility = _model_package_compatibility( + model_package_name=model_package_name, + model_package_version=model_package_version, + ) + if model_package_compatibility: + manifest["compatible_model_packages"] = model_package_compatibility + + manifest.setdefault("compatible_core_packages", []) + core_package_compatibility = _core_package_compatibility( + core_package_metadata=core_package_metadata, + ) + if core_package_compatibility: + manifest["compatible_core_packages"] = core_package_compatibility + + +def _update_artifacts( + manifest: Dict, + *, + files_with_repo_paths: Sequence[Tuple[Path | str, str]], + repo_id: str, + repo_type: str, + version: str, +) -> None: + artifacts = manifest.setdefault("artifacts", {}) + for local_path, path_in_repo in files_with_repo_paths: + local_path = Path(local_path) + artifacts[_artifact_key(path_in_repo)] = { + "kind": _artifact_kind(path_in_repo), + "uri": _artifact_uri( + repo_id=repo_id, + repo_type=repo_type, + revision=version, + path_in_repo=path_in_repo, + ), + "path": path_in_repo, + "repo_id": repo_id, + "revision": version, + "sha256": _compute_file_checksum(local_path), + "size_bytes": local_path.stat().st_size, + "metadata": { + "repo_type": repo_type, + "visibility": _artifact_visibility(repo_id), + }, + } + + +def _update_default_datasets( + manifest: Dict, + *, + default_datasets: Optional[Mapping[str, str]], +) -> None: + defaults = manifest.setdefault("default_datasets", {}) + if default_datasets: + defaults.update(default_datasets) + if "national" not in defaults and "enhanced_frs_2023_24" in manifest.get( + "artifacts", {} + ): + defaults["national"] = "enhanced_frs_2023_24" + if "baseline" not in defaults and "frs_2023_24" in manifest.get("artifacts", {}): + defaults["baseline"] = "frs_2023_24" def _normalize_existing_manifest( @@ -96,7 +304,9 @@ def _normalize_existing_manifest( package = existing_manifest.get("data_package", {}) if package.get("name") != data_package_name or package.get("version") != version: return None - return deepcopy(dict(existing_manifest)) + manifest = deepcopy(dict(existing_manifest)) + manifest.pop("created_at", None) + return manifest def build_release_manifest( @@ -104,11 +314,14 @@ def build_release_manifest( files_with_repo_paths: Sequence[Tuple[Path | str, str]], version: str, repo_id: str, + repo_type: str = "model", data_package_name: str = "policyengine-uk-data", model_package_name: str = "policyengine-uk", model_package_version: str | None = None, model_package_git_sha: str | None = None, model_package_data_build_fingerprint: str | None = None, + core_package_metadata: Optional[Mapping[str, Any]] = None, + data_package_git_sha: str | None = None, build_id: str | None = None, existing_manifest: Mapping | None = None, default_datasets: Optional[Mapping[str, str]] = None, @@ -123,61 +336,45 @@ def build_release_manifest( resolved_build_id = build_id or f"{data_package_name}-{version}" if manifest is None: - manifest = _base_manifest( + manifest = _new_release_manifest( version=version, data_package_name=data_package_name, - model_package_name=model_package_name, - model_package_version=model_package_version, - model_package_git_sha=model_package_git_sha, - model_package_data_build_fingerprint=model_package_data_build_fingerprint, - build_id=resolved_build_id, - created_at=manifest_timestamp, ) - else: - manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION - manifest["created_at"] = manifest_timestamp - manifest.setdefault("build", {}) - manifest["build"].setdefault("build_id", resolved_build_id) - manifest["build"].setdefault("built_at", manifest_timestamp) - if ( - model_package_version - or model_package_git_sha - or model_package_data_build_fingerprint - ): - manifest["build"]["built_with_model_package"] = { - "name": model_package_name, - "version": model_package_version, - "git_sha": model_package_git_sha, - "data_build_fingerprint": model_package_data_build_fingerprint, - } - if model_package_version: - manifest["compatible_model_packages"] = [ - { - "name": model_package_name, - "specifier": f"=={model_package_version}", - } - ] - - if default_datasets: - manifest.setdefault("default_datasets", {}).update(default_datasets) - - for local_path, path_in_repo in files_with_repo_paths: - local_path = Path(local_path) - manifest["artifacts"][_artifact_key(path_in_repo)] = { - "kind": _artifact_kind(path_in_repo), - "path": path_in_repo, - "repo_id": repo_id, - "revision": version, - "sha256": _compute_file_checksum(local_path), - "size_bytes": local_path.stat().st_size, - } - - defaults = manifest["default_datasets"] - if "national" not in defaults and "enhanced_frs_2023_24" in manifest["artifacts"]: - defaults["national"] = "enhanced_frs_2023_24" - if "baseline" not in defaults and "frs_2023_24" in manifest["artifacts"]: - defaults["baseline"] = "frs_2023_24" + _update_manifest_metadata( + manifest, + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + _update_build_section( + manifest, + build_id=resolved_build_id, + created_at=manifest_timestamp, + data_package_git_sha=data_package_git_sha, + model_package_name=model_package_name, + model_package_version=model_package_version, + model_package_git_sha=model_package_git_sha, + model_package_data_build_fingerprint=model_package_data_build_fingerprint, + core_package_metadata=core_package_metadata, + ) + _update_compatibility( + manifest, + model_package_name=model_package_name, + model_package_version=model_package_version, + core_package_metadata=core_package_metadata, + ) + _update_artifacts( + manifest, + files_with_repo_paths=files_with_repo_paths, + repo_id=repo_id, + repo_type=repo_type, + version=version, + ) + _update_default_datasets( + manifest, + default_datasets=default_datasets, + ) return manifest