From a76ddc2db4ae50694f0658f0cf7c587a054cfcf4 Mon Sep 17 00:00:00 2001 From: Jeremy Zilar Date: Fri, 1 May 2026 11:15:58 -0400 Subject: [PATCH 1/3] Add migration script to populate NMBGMR site names as alternate IDs The legacy Location.csv has a SiteNames column that was never transferred into the ThingIdLink table. This left site_name null for all wells in the API response. The script reads SiteNames from the CSV and inserts NMBGMR ThingIdLink rows for all matched wells. It is idempotent and safe to re-run after future well transfers. --- transfers/migrate_nmbgmr_site_names.py | 111 +++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 transfers/migrate_nmbgmr_site_names.py diff --git a/transfers/migrate_nmbgmr_site_names.py b/transfers/migrate_nmbgmr_site_names.py new file mode 100644 index 00000000..95c567a0 --- /dev/null +++ b/transfers/migrate_nmbgmr_site_names.py @@ -0,0 +1,111 @@ +""" +One-time data migration: populate NMBGMR site names as ThingIdLink records. + +The legacy Location.csv has a SiteNames column with the human-readable site +name assigned by NMBGMR (e.g. "Zwager domestic", "Pendaries Village Well #1"). +This value was never transferred into the ThingIdLink table, so the site_name +property on Thing always returned None. + +This script is idempotent: it skips any (thing_id, NMBGMR, alternate_id) row +that already exists. 
+ +Usage (from repo root, with venv active): + python -m transfers.migrate_nmbgmr_site_names +""" + +import logging + +import pandas as pd +from sqlalchemy import insert, select, tuple_ + +from db import Thing, ThingIdLink +from db.engine import session_ctx +from transfers.util import get_transfers_data_path + +logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +ALTERNATE_ORGANIZATION = "NMBGMR" +RELATION = "same_as" +RELEASE_STATUS = "public" + + +def run(): + csv_path = get_transfers_data_path("nma_csv_cache/Location.csv") + logger.info("Reading %s", csv_path) + + df = pd.read_csv(csv_path, dtype=str, usecols=["PointID", "SiteNames"]) + df = df[ + df["SiteNames"].notna() + & (df["SiteNames"] != "NULL") + & (df["SiteNames"].str.strip() != "") + ].copy() + df["SiteNames"] = df["SiteNames"].str.strip() + logger.info("%d rows with a non-empty SiteNames value", len(df)) + + with session_ctx() as session: + # Build a PointID -> thing_id map for all matching wells in one query. + point_ids = df["PointID"].tolist() + thing_id_by_pointid: dict[str, int] = { + name: thing_id + for name, thing_id in session.execute( + select(Thing.name, Thing.id).where(Thing.name.in_(point_ids)) + ).all() + } + logger.info( + "%d / %d PointIDs matched a Thing in the database", + len(thing_id_by_pointid), + len(df), + ) + + # Build candidate rows. + candidates: list[dict] = [] + for row in df.itertuples(index=False): + thing_id = thing_id_by_pointid.get(row.PointID) + if thing_id is None: + continue + candidates.append( + { + "thing_id": thing_id, + "relation": RELATION, + "alternate_id": row.SiteNames, + "alternate_organization": ALTERNATE_ORGANIZATION, + "release_status": RELEASE_STATUS, + } + ) + + # Skip rows that already exist (idempotent). 
+ existing_keys: set[tuple[int, str, str]] = set( + session.execute( + select( + ThingIdLink.thing_id, + ThingIdLink.alternate_organization, + ThingIdLink.alternate_id, + ).where( + ThingIdLink.alternate_organization == ALTERNATE_ORGANIZATION + ) + ).all() + ) + logger.info( + "%d NMBGMR ThingIdLink rows already in the database", len(existing_keys) + ) + + rows_to_insert = [ + r + for r in candidates + if (r["thing_id"], r["alternate_organization"], r["alternate_id"]) + not in existing_keys + ] + logger.info("%d new rows to insert", len(rows_to_insert)) + + if not rows_to_insert: + logger.info("Nothing to do.") + return + + session.execute(insert(ThingIdLink), rows_to_insert) + session.commit() + logger.info("Done. Inserted %d NMBGMR site name links.", len(rows_to_insert)) + + +if __name__ == "__main__": + run() From e66989273dd531b69ceb528574ef7b6ce9c58c41 Mon Sep 17 00:00:00 2001 From: jeremyzilar <395641+jeremyzilar@users.noreply.github.com> Date: Fri, 1 May 2026 15:16:43 +0000 Subject: [PATCH 2/3] Formatting changes --- transfers/migrate_nmbgmr_site_names.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/transfers/migrate_nmbgmr_site_names.py b/transfers/migrate_nmbgmr_site_names.py index 95c567a0..daa3f740 100644 --- a/transfers/migrate_nmbgmr_site_names.py +++ b/transfers/migrate_nmbgmr_site_names.py @@ -81,9 +81,7 @@ def run(): ThingIdLink.thing_id, ThingIdLink.alternate_organization, ThingIdLink.alternate_id, - ).where( - ThingIdLink.alternate_organization == ALTERNATE_ORGANIZATION - ) + ).where(ThingIdLink.alternate_organization == ALTERNATE_ORGANIZATION) ).all() ) logger.info( From 48305f20462fee9add9aaeea37bcbbde77bdae7a Mon Sep 17 00:00:00 2001 From: Jeremy Zilar Date: Fri, 1 May 2026 12:00:41 -0400 Subject: [PATCH 3/3] Use LocationId/nma_pk_location for NMBGMR site name matching PointID is not unique across all rows in Location.csv (MB-1005 appears twice with different SiteNames). 
Switch to matching LocationId (the UUID primary key from NM_Aquifer, which has higher fidelity) against Thing.nma_pk_location. Suggested by jacob-a-brown in PR #668. --- transfers/migrate_nmbgmr_site_names.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/transfers/migrate_nmbgmr_site_names.py b/transfers/migrate_nmbgmr_site_names.py index daa3f740..2325b383 100644 --- a/transfers/migrate_nmbgmr_site_names.py +++ b/transfers/migrate_nmbgmr_site_names.py @@ -34,7 +34,7 @@ def run(): csv_path = get_transfers_data_path("nma_csv_cache/Location.csv") logger.info("Reading %s", csv_path) - df = pd.read_csv(csv_path, dtype=str, usecols=["PointID", "SiteNames"]) + df = pd.read_csv(csv_path, dtype=str, usecols=["LocationId", "SiteNames"]) df = df[ df["SiteNames"].notna() & (df["SiteNames"] != "NULL") @@ -44,24 +44,29 @@ def run(): logger.info("%d rows with a non-empty SiteNames value", len(df)) with session_ctx() as session: - # Build a PointID -> thing_id map for all matching wells in one query. - point_ids = df["PointID"].tolist() - thing_id_by_pointid: dict[str, int] = { - name: thing_id - for name, thing_id in session.execute( - select(Thing.name, Thing.id).where(Thing.name.in_(point_ids)) + # Match on LocationId -> nma_pk_location rather than PointID -> name. + # PointID is not unique across all Location rows; LocationId (the UUID + # primary key from NM_Aquifer) has higher fidelity. Suggested by + # jacob-a-brown in PR #668. + location_ids = df["LocationId"].tolist() + thing_id_by_location_id: dict[str, int] = { + location_id: thing_id + for location_id, thing_id in session.execute( + select(Thing.nma_pk_location, Thing.id).where( + Thing.nma_pk_location.in_(location_ids) + ) ).all() } logger.info( - "%d / %d PointIDs matched a Thing in the database", - len(thing_id_by_pointid), + "%d / %d LocationIds matched a Thing in the database", + len(thing_id_by_location_id), len(df), ) # Build candidate rows. 
candidates: list[dict] = [] for row in df.itertuples(index=False): - thing_id = thing_id_by_pointid.get(row.PointID) + thing_id = thing_id_by_location_id.get(row.LocationId) if thing_id is None: continue candidates.append(