From f78f538a0c333324f1974af5138be637f3cbc274 Mon Sep 17 00:00:00 2001 From: Michael Smit Date: Wed, 21 May 2025 20:56:04 -0700 Subject: [PATCH] Always check for a new data file version even if one has been downloaded. Related to PolicyEngine/issues#350 The existing code is inconsistent about how and when it decides to download a data file, and we haven't clearly defined the intended behavior. We are prioritizing the simulation API use case, in which we always want to use the most recent version of the data file for a simulation. This change means that if the code specifies a remote data file (either by explicitly giving a URL or by defaulting to a country dataset), we will always check for a new version when creating a Simulation object, even if we have a local copy. --- changelog_entry.yaml | 4 ++++ policyengine/simulation.py | 15 +++++++-------- policyengine/utils/data_download.py | 4 ---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..3c43bbd7 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Always look for new data file versions even if we have a local copy of one. 
diff --git a/policyengine/simulation.py b/policyengine/simulation.py index f40d5d29..aa5858a9 100644 --- a/policyengine/simulation.py +++ b/policyengine/simulation.py @@ -135,14 +135,13 @@ def _set_data(self): -1 ].split("/", 2) - if not Path(filename).exists(): - file_path = download( - filepath=filename, - huggingface_org=hf_org, - huggingface_repo=hf_repo, - gcs_bucket=bucket, - ) - filename = str(Path(file_path)) + file_path = download( + filepath=filename, + huggingface_org=hf_org, + huggingface_repo=hf_repo, + gcs_bucket=bucket, + ) + filename = str(Path(file_path)) if "cps_2023" in filename: time_period = 2023 else: diff --git a/policyengine/utils/data_download.py b/policyengine/utils/data_download.py index c7722173..5b0f776a 100644 --- a/policyengine/utils/data_download.py +++ b/policyengine/utils/data_download.py @@ -40,10 +40,6 @@ def download( except: logging.info("Failed to download from Hugging Face.") - if Path(filepath).exists(): - logging.info(f"File {filepath} already exists. Skipping download.") - return filepath - if data_file.gcs_bucket is not None: logging.info("Using Google Cloud Storage for download.") download_file_from_gcs(