diff --git a/changelog.d/harden-pregnancy-takeup.fixed.md b/changelog.d/harden-pregnancy-takeup.fixed.md new file mode 100644 index 000000000..83b67f2d5 --- /dev/null +++ b/changelog.d/harden-pregnancy-takeup.fixed.md @@ -0,0 +1 @@ +Harden CPS pregnancy take-up rates to use the build year and fall back across nearby CDC and ACS vintages. diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 03830647b..baee98234 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -587,7 +587,10 @@ def add_takeup(self): get_state_pregnancy_rates, ) - pregnancy_rates = get_state_pregnancy_rates() + pregnancy_rates = get_state_pregnancy_rates( + cdc_year=self.time_period, + acs_year=self.time_period, + ) national_rate = 0.041 # fallback pregnancy_rate_by_person = np.array( [pregnancy_rates.get(s, national_rate) for s in person_states] diff --git a/policyengine_us_data/db/etl_pregnancy.py b/policyengine_us_data/db/etl_pregnancy.py index 3e4326184..2ef05e177 100644 --- a/policyengine_us_data/db/etl_pregnancy.py +++ b/policyengine_us_data/db/etl_pregnancy.py @@ -30,6 +30,7 @@ ) from policyengine_us_data.utils.census import STATE_ABBREV_TO_FIPS from policyengine_us_data.utils.db import ( + DEFAULT_YEAR, get_geographic_strata, etl_argparser, ) @@ -395,8 +396,8 @@ def load_pregnancy_data( def get_state_pregnancy_rates( - cdc_year: int = 2023, - acs_year: int = 2023, + cdc_year: int = DEFAULT_YEAR, + acs_year: int = DEFAULT_YEAR, ) -> dict: """Return {state_abbrev: pregnancy_rate} for use by cps.py. @@ -413,8 +414,39 @@ def get_state_pregnancy_rates( rate (probability that a woman aged 15-44 is currently pregnant). """ - births_df = extract_cdc_births(cdc_year) - pop_df = extract_female_population(acs_year) + births_df = None + birth_errors = [] + for candidate_cdc_year in [cdc_year, cdc_year - 1]: + try: + births_df = extract_cdc_births(candidate_cdc_year) + break + except Exception as e: + birth_errors.append(f"{candidate_cdc_year}: {e}") + logger.warning( + f"CDC VSRR {candidate_cdc_year} not available for take-up: {e}" + ) + if births_df is None: + raise RuntimeError( + "No CDC VSRR birth data for pregnancy take-up rates. " + f"Tried {cdc_year} and {cdc_year - 1}: {'; '.join(birth_errors)}" + ) + + pop_df = None + population_errors = [] + for candidate_acs_year in [acs_year, acs_year - 1, acs_year - 2]: + try: + pop_df = extract_female_population(candidate_acs_year) + break + except Exception as e: + population_errors.append(f"{candidate_acs_year}: {e}") + logger.warning(f"ACS {candidate_acs_year} not available for take-up: {e}") + if pop_df is None: + raise RuntimeError( + "No ACS female population data for pregnancy take-up rates. " + f"Tried {acs_year}, {acs_year - 1}, and {acs_year - 2}: " + f"{'; '.join(population_errors)}" + ) + df = transform_pregnancy_data(births_df, pop_df) return dict(zip(df["state_abbrev"], df["pregnancy_rate"])) diff --git a/tests/unit/db/test_etl_pregnancy.py b/tests/unit/db/test_etl_pregnancy.py index 1f61492a7..69651177b 100644 --- a/tests/unit/db/test_etl_pregnancy.py +++ b/tests/unit/db/test_etl_pregnancy.py @@ -1,3 +1,5 @@ +import pandas as pd + from policyengine_us_data.db import etl_pregnancy @@ -124,3 +126,37 @@ def fake_get(url, **kwargs): ), ] assert saved == [("census_b01001_female_15_44_2023.json", b01001_payload())] + + +def test_get_state_pregnancy_rates_falls_back_to_available_years(monkeypatch): + calls = [] + + def fake_extract_cdc_births(year): + calls.append(("births", year)) + if year == 2024: + raise RuntimeError("CDC unavailable") + return pd.DataFrame({"state_abbrev": ["AL"], "births": [52_000]}) + + def fake_extract_female_population(year): + calls.append(("population", year)) + if year in (2024, 2023): + raise RuntimeError("ACS unavailable") + return pd.DataFrame({"state_abbrev": ["AL"], "female_15_44": [1_000_000]}) + + monkeypatch.setattr(etl_pregnancy, "extract_cdc_births", fake_extract_cdc_births) + monkeypatch.setattr( + etl_pregnancy, + "extract_female_population", + fake_extract_female_population, + ) + + rates = etl_pregnancy.get_state_pregnancy_rates(cdc_year=2024, acs_year=2024) + + assert calls == [ + ("births", 2024), + ("births", 2023), + ("population", 2024), + ("population", 2023), + ("population", 2022), + ] + assert rates == {"AL": 0.039}