Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changelog.d/small-follow-ups.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Two small follow-ups surfaced by the v4 audit:

- ``Simulation.save()`` now raises a helpful ``ValueError`` when ``output_dataset`` is still ``None`` (run or ensure was never called), instead of the raw ``AttributeError: 'NoneType' object has no attribute 'save'``.
- New ``scripts/check_data_staleness.py`` prints a one-line verdict per country comparing the bundled release manifest's pinned model version against the installed country package; exits non-zero when any country is stale. Drop into CI to get automated "bundle is behind" alerts without waiting for a human to notice.

Also clarified the ``load()`` docstring: ``created_at``/``updated_at`` are filesystem ``ctime``/``mtime`` approximations, not a true round-trip of the original timestamps.
80 changes: 80 additions & 0 deletions scripts/check_data_staleness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Flag when the bundled country release manifest falls behind upstream.

Each bundled country release manifest pins the country-model version the
certified microdata was built with. If that pin drifts behind the currently
installed country-model version, downstream calculations may diverge from what
the data was calibrated against. This script surfaces that drift so a CI job
or a human can decide whether to kick off a data rebuild.

Outputs a single-line verdict per country, plus a summary. Exits non-zero when
any country is stale so CI can gate on it.

No network access: reads both sides locally via ``importlib.metadata`` and the
packaged release manifest loader.
"""

from __future__ import annotations

import sys
from importlib import metadata
from importlib.util import find_spec

from policyengine.provenance.manifest import (
CountryReleaseManifest,
get_release_manifest,
)

COUNTRIES = ("us", "uk")


def load_release_manifest(country: str) -> CountryReleaseManifest:
return get_release_manifest(country)


def check_country(country: str) -> tuple[str, bool]:
"""Return ``(one-line-verdict, is_stale)``."""
manifest = load_release_manifest(country)

pkg = manifest.model_package.name
pinned = manifest.model_package.version

if find_spec(pkg.replace("-", "_")) is None:
return (
f"{country}: {pkg} not installed; skipping staleness check",
False,
)

try:
installed = metadata.version(pkg)
except metadata.PackageNotFoundError:
return (
f"{country}: {pkg} importable but package metadata not found; "
"skipping staleness check",
False,
)

if installed == pinned:
return (
f"{country}: ok ({pkg} {installed} matches bundle pin)",
False,
)
return (
f"{country}: STALE ({pkg} installed={installed}, "
f"bundle pin={pinned}; consider a release-bundle refresh)",
True,
)


def main() -> int:
verdicts = []
any_stale = False
for country in COUNTRIES:
verdict, stale = check_country(country)
verdicts.append(verdict)
any_stale = any_stale or stale
print("\n".join(verdicts))
return 1 if any_stale else 0


if __name__ == "__main__":
sys.exit(main())
12 changes: 10 additions & 2 deletions src/policyengine/tax_benefit_models/common/model_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,21 @@ def resolve_entity_variables(

def save(self, simulation: Simulation) -> None:
"""Persist the simulation's output dataset to its bundled filepath."""
if simulation.output_dataset is None:
raise ValueError(
"Simulation.save() called with no output_dataset. Run "
"simulation.run() or simulation.ensure() first so there is "
"something to persist."
)
simulation.output_dataset.save()

def load(self, simulation: Simulation) -> None:
"""Rehydrate the simulation's output dataset from disk.

Loads timestamps from filesystem metadata when the file exists so
serialised simulations round-trip ``created_at``/``updated_at``.
Filesystem ``ctime``/``mtime`` on the output file are mirrored onto
``simulation.created_at`` / ``updated_at`` on load. These fields
are not written back on ``save()``, so they're filesystem
approximations, not a true round-trip of the original timestamps.
"""
filepath = str(
Path(simulation.dataset.filepath).parent / (simulation.id + ".h5")
Expand Down
76 changes: 76 additions & 0 deletions tests/test_small_follow_ups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Regression tests for small follow-up fixes from the audit."""

from __future__ import annotations

import subprocess
import sys
from pathlib import Path

import pytest

REPO_ROOT = Path(__file__).resolve().parent.parent


def test__save_with_no_output_dataset_raises_helpfully() -> None:
"""Before: raw ``AttributeError`` from ``None.save()``."""
pytest.importorskip("policyengine_us")

import policyengine as pe
from policyengine.core import Simulation
from tests.fixtures.filtering_fixtures import create_us_test_dataset

sim = Simulation(
dataset=create_us_test_dataset(),
tax_benefit_model_version=pe.us.model,
)
assert sim.output_dataset is None

with pytest.raises(ValueError, match="no output_dataset"):
sim.save()


def test__check_data_staleness_script_runs_and_reports_status() -> None:
"""The script prints one verdict line per configured country."""
pytest.importorskip("policyengine_us")

result = subprocess.run(
[sys.executable, str(REPO_ROOT / "scripts" / "check_data_staleness.py")],
capture_output=True,
text=True,
check=False,
)
lines = [line for line in result.stdout.splitlines() if line.strip()]
assert any(line.startswith("us:") for line in lines), result.stdout
assert any(line.startswith("uk:") for line in lines), result.stdout
if result.returncode != 0:
assert "STALE" in result.stdout, (
f"Non-zero exit but no STALE report: {result.stdout!r} / {result.stderr!r}"
)


def test__check_data_staleness_script_detects_drift() -> None:
"""Directly exercise the drift path without reinstalling packages."""
pytest.importorskip("policyengine_us")

import importlib.util

spec = importlib.util.spec_from_file_location(
"check_data_staleness",
REPO_ROOT / "scripts" / "check_data_staleness.py",
)
assert spec is not None and spec.loader is not None
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

real_manifest = module.load_release_manifest("us").model_copy(deep=True)
real_manifest.model_package.version = "0.0.0-drift"

def drifted_manifest(country: str):
return real_manifest

module.load_release_manifest = drifted_manifest

verdict, stale = module.check_country("us")
assert stale is True
assert "STALE" in verdict
assert "0.0.0-drift" in verdict
Loading