Skip to content

Commit 48305f2

Browse files
committed
Use LocationId/nma_pk_location for NMBGMR site name matching
PointID is not unique across rows in Location.csv (for example, MB-1005 appears twice with different SiteNames). Match LocationId against Thing.nma_pk_location instead: LocationId is the UUID primary key from NM_Aquifer, so it identifies each location row unambiguously. Suggested by jacob-a-brown in PR #668.
1 parent e669892 commit 48305f2

1 file changed

Lines changed: 15 additions & 10 deletions

File tree

transfers/migrate_nmbgmr_site_names.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def run():
3434
csv_path = get_transfers_data_path("nma_csv_cache/Location.csv")
3535
logger.info("Reading %s", csv_path)
3636

37-
df = pd.read_csv(csv_path, dtype=str, usecols=["PointID", "SiteNames"])
37+
df = pd.read_csv(csv_path, dtype=str, usecols=["LocationId", "SiteNames"])
3838
df = df[
3939
df["SiteNames"].notna()
4040
& (df["SiteNames"] != "NULL")
@@ -44,24 +44,29 @@ def run():
4444
logger.info("%d rows with a non-empty SiteNames value", len(df))
4545

4646
with session_ctx() as session:
47-
# Build a PointID -> thing_id map for all matching wells in one query.
48-
point_ids = df["PointID"].tolist()
49-
thing_id_by_pointid: dict[str, int] = {
50-
name: thing_id
51-
for name, thing_id in session.execute(
52-
select(Thing.name, Thing.id).where(Thing.name.in_(point_ids))
47+
# Match on LocationId -> nma_pk_location rather than PointID -> name.
48+
# PointID is not unique across all Location rows; LocationId (the UUID
49+
# primary key from NM_Aquifer) has higher fidelity. Suggested by
50+
# jacob-a-brown in PR #668.
51+
location_ids = df["LocationId"].tolist()
52+
thing_id_by_location_id: dict[str, int] = {
53+
location_id: thing_id
54+
for location_id, thing_id in session.execute(
55+
select(Thing.nma_pk_location, Thing.id).where(
56+
Thing.nma_pk_location.in_(location_ids)
57+
)
5358
).all()
5459
}
5560
logger.info(
56-
"%d / %d PointIDs matched a Thing in the database",
57-
len(thing_id_by_pointid),
61+
"%d / %d LocationIds matched a Thing in the database",
62+
len(thing_id_by_location_id),
5863
len(df),
5964
)
6065

6166
# Build candidate rows.
6267
candidates: list[dict] = []
6368
for row in df.itertuples(index=False):
64-
thing_id = thing_id_by_pointid.get(row.PointID)
69+
thing_id = thing_id_by_location_id.get(row.LocationId)
6570
if thing_id is None:
6671
continue
6772
candidates.append(

0 commit comments

Comments
 (0)