From 7a01e2cccb1a93558ce6fbef2ca44904d4ecca64 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 6 May 2026 18:59:00 -0700 Subject: [PATCH 1/3] Refresh geotiff mmap cache when the file at a path is replaced The shared `_MmapCache` keyed entries by realpath and never re-stat'd after the first open. `to_geotiff` writes via `tempfile.mkstemp` + `os.replace`, which keeps the path stable but swaps the inode. A process that read a TIFF, overwrote it via `to_geotiff`, then read again would get bytes from the unlinked old file -- the cache served the prior mmap because the realpath still matched. Same hazard hits in-place truncate-and-rewrite from any external tool. Cache entries now carry `(st_ino, st_size, st_mtime_ns)`; `acquire` stat's the path and drops the cached entry when the ident has changed. Idle entries are closed; in-use mmaps stay valid for their current holders (they still own a reference) but are unlinked from the cache so subsequent `acquire` calls re-mmap the new file. Tests cover atomic-rename replacement and in-place truncate overwrite. --- .claude/sweep-accuracy-state.csv | 3 +- CHANGELOG.md | 1 + xrspatial/geotiff/_reader.py | 39 ++++++++++++++--- xrspatial/geotiff/tests/test_polish_1488.py | 47 +++++++++++++++++++++ 4 files changed, 84 insertions(+), 6 deletions(-) diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 54e2d306..971e3371 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -13,7 +13,7 @@ fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, geotiff,2026-05-06,1500,HIGH,3,"Pass 3 (2026-05-06): HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. (Pass 3 also opened #1499 for predictor=2 sample-wise decode but it was a duplicate of already-merged #1498 -- closed.) | Pass 2 (2026-05-05) PR #1498: predictor=2 was byte-wise at sample stride, multi-byte integer files written by libtiff/GDAL silently decoded to wrong values; now sample-wise per TIFF spec on CPU and GPU. | Pass 1 (2026-04-23) PR #1247: HIGH fixed CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: BigTIFF writers emit LONG8 strip/tile offsets (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." -geotiff,2026-05-06,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads ("incomplete or truncated stream"), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings. +geotiff,2026-05-06,1498,HIGH,1;5,"HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads (""incomplete or truncated stream""), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings." glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues. @@ -35,3 +35,4 @@ terrain_metrics,2026-04-30,,LOW,2;5,"LOW: Inf input not rejected, propagates as visibility,2026-04-13T12:00:00Z,,,,"Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy." worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so integer input dtype produces integer output (distances truncated to 0); cupy/dask paths always produce float32. LOW: freq=inf produces 100000 sentinel (sqrt of initial min_dist=1e10), no validation of freq/seed for non-finite values." zonal,2026-03-30T12:00:00Z,1090,,, +geotiff,2026-05-06,pending-pr-cache-stale,MEDIUM,5,"Pass 5 (2026-05-06): MEDIUM fixed in fix-mmap-cache-stale-inode -- _MmapCache returned stale bytes after the file at the same path was replaced (e.g. by to_geotiff which writes via tempfile + os.replace, swapping the inode). Reproduces on numpy and GPU read paths: write A, read A, write B at same path, read returns A. Fixed by storing (st_ino, st_size, st_mtime_ns) per cache entry and dropping the entry on acquire when the file ident has changed; in-use mmaps stay valid for current holders. Tests added for both atomic-rename replacement and in-place truncate-and-rewrite. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles (TileByteCounts==0). | Pass 3 (2026-05-06) PR #1500: predictor=3 byte-order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247. Re-confirmed clean: south-up flipping is a documented limitation in test_georef_edges.test_y_coords_known_limitation; CRS round-trip 4326/3857/32633/5070 exact; LERC int default lossless; NaN bit patterns survive predictor=3 round-trip; RATIONAL den=0 returns 0.0; ASCII tag NUL termination spec-correct on write; adler32 in GPU deflate path is per-tile not per-chunk so chunk-combine math does not apply; GeoDoubleParams uses correct double-array offset semantics. Deferred: per-band different NODATA via GDAL_METADATA exposed in attrs but not applied (single dataset-level value used for masking)." diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d3805ad..a5b6115c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Unreleased #### Bug fixes and improvements +- Refresh the geotiff mmap cache when a file is replaced under the same path so re-reads after an atomic-rename overwrite no longer return stale bytes - Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index a7ce2359..2aa8548b 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -82,17 +82,44 @@ class _MmapCache: def __init__(self, max_size: int | None = None): self._lock = threading.Lock() - # path -> [fh, mm, size, refcount] (list so we can mutate in place) - # OrderedDict gives LRU semantics via move_to_end on access. + # path -> [fh, mm, size, refcount, ident] + # ``ident`` is (st_ino, st_size, st_mtime_ns) used to spot files that + # were replaced (e.g. via ``os.replace`` on an atomic write) at the + # same path. ``list`` so we can mutate in place; ``OrderedDict`` gives + # LRU semantics via move_to_end. self._entries: OrderedDict[str, list] = OrderedDict() self._max_size = (max_size if max_size is not None else _mmap_cache_size_from_env()) + @staticmethod + def _file_ident(path: str): + """Return a (st_ino, st_size, st_mtime_ns) tuple for *path* or None.""" + try: + st = _os_module.stat(path) + except OSError: + return None + return (st.st_ino, st.st_size, st.st_mtime_ns) + def acquire(self, path: str): """Get or create a read-only mmap for *path*. Returns (mm, size).""" real = _os_module.path.realpath(path) with self._lock: entry = self._entries.get(real) + ident = self._file_ident(real) + if entry is not None: + # If the file at this path has been replaced (different inode, + # size, or mtime) the cached mmap is stale. Drop the entry so + # we re-open below. If the old entry is still in use by other + # callers, leave their mmap valid -- they still hold a + # reference -- but unlink the cache slot so it isn't reused. + if ident is not None and entry[4] != ident: + self._entries.pop(real) + if entry[3] <= 0: + if entry[1] is not None: + entry[1].close() + entry[0].close() + entry = None + if entry is not None: entry[3] += 1 self._entries.move_to_end(real) @@ -106,7 +133,9 @@ def acquire(self, path: str): mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) else: mm = None - self._entries[real] = [fh, mm, size, 1] + # Re-stat after opening so size matches the mmap we built. + ident = self._file_ident(real) or (0, size, 0) + self._entries[real] = [fh, mm, size, 1, ident] self._evict_locked() return mm, size @@ -142,7 +171,7 @@ def _evict_locked(self): to_drop.append(key) for key in to_drop: entry = self._entries.pop(key) - _, mm, _, _ = entry + mm = entry[1] if mm is not None: mm.close() entry[0].close() @@ -152,7 +181,7 @@ def clear(self): with self._lock: for key in [k for k, v in self._entries.items() if v[3] <= 0]: entry = self._entries.pop(key) - _, mm, _, _ = entry + mm = entry[1] if mm is not None: mm.close() entry[0].close() diff --git a/xrspatial/geotiff/tests/test_polish_1488.py b/xrspatial/geotiff/tests/test_polish_1488.py index e9c7ad76..1308fbae 100644 --- a/xrspatial/geotiff/tests/test_polish_1488.py +++ b/xrspatial/geotiff/tests/test_polish_1488.py @@ -193,6 +193,53 @@ def test_env_var_override(self, tmp_path, monkeypatch): cache = _MmapCache() assert cache._max_size == 5 + def test_replaced_file_invalidates_idle_entry(self, tmp_path): + """``os.replace`` swaps the inode under a stable path. The cache + must spot the new inode and re-mmap rather than serve stale bytes. + Reproduces a real ``to_geotiff`` round-trip pattern -- the writer + does an atomic rename, so a re-read after overwriting the same + path used to return data from the unlinked old file.""" + cache = _MmapCache() + p = tmp_path / 'replaced_1488.bin' + p.write_bytes(b'A' * 16) + + mm1, _ = cache.acquire(str(p)) + first = bytes(mm1[:16]) + assert first == b'A' * 16 + cache.release(str(p)) + + # Atomic-rename style replacement: same path, new inode. + new = tmp_path / 'replaced_1488.bin.tmp' + new.write_bytes(b'B' * 16) + os.replace(str(new), str(p)) + + mm2, _ = cache.acquire(str(p)) + assert bytes(mm2[:16]) == b'B' * 16 + cache.release(str(p)) + + def test_truncated_file_invalidates_idle_entry(self, tmp_path): + """In-place truncate-and-overwrite (``open(p, 'wb')``) preserves the + inode but changes size and mtime, so the cache must still + invalidate.""" + cache = _MmapCache() + p = tmp_path / 'truncate_1488.bin' + p.write_bytes(b'A' * 32) + + mm1, sz1 = cache.acquire(str(p)) + assert sz1 == 32 + cache.release(str(p)) + + # In-place rewrite with a smaller payload. + import time + time.sleep(0.01) # ensure st_mtime_ns advances + with open(str(p), 'wb') as fh: + fh.write(b'C' * 8) + + mm2, sz2 = cache.acquire(str(p)) + assert sz2 == 8 + assert bytes(mm2[:8]) == b'C' * 8 + cache.release(str(p)) + # --------------------------------------------------------------------------- # P-4: decode parallelism threshold From 4fcec411f4945cfcbc2117a3995b3091b0cddff0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 7 May 2026 06:06:22 -0700 Subject: [PATCH 2/3] Skip POSIX-only mmap cache tests on Windows Windows holds a file lock while a path is mmapped, so os.replace and open(path, 'wb') fail before the cache invalidation check runs. The stale-cache bug these tests guard against cannot occur on Windows because the OS prevents the precondition. --- xrspatial/geotiff/tests/test_polish_1488.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/xrspatial/geotiff/tests/test_polish_1488.py b/xrspatial/geotiff/tests/test_polish_1488.py index 1308fbae..dca09afe 100644 --- a/xrspatial/geotiff/tests/test_polish_1488.py +++ b/xrspatial/geotiff/tests/test_polish_1488.py @@ -17,6 +17,7 @@ from __future__ import annotations import os +import sys import warnings import numpy as np @@ -193,6 +194,13 @@ def test_env_var_override(self, tmp_path, monkeypatch): cache = _MmapCache() assert cache._max_size == 5 + @pytest.mark.skipif( + sys.platform.startswith('win'), + reason="Windows holds a file lock while a path is mmapped, " + "so os.replace fails before the cache lookup runs. " + "The stale-cache bug this test guards against can only " + "occur on POSIX.", + ) def test_replaced_file_invalidates_idle_entry(self, tmp_path): """``os.replace`` swaps the inode under a stable path. The cache must spot the new inode and re-mmap rather than serve stale bytes. @@ -217,6 +225,12 @@ def test_replaced_file_invalidates_idle_entry(self, tmp_path): assert bytes(mm2[:16]) == b'B' * 16 cache.release(str(p)) + @pytest.mark.skipif( + sys.platform.startswith('win'), + reason="Windows blocks open(path, 'wb') while the path is " + "mmapped, so the in-place rewrite fails before the " + "cache invalidation check. POSIX-only scenario.", + ) def test_truncated_file_invalidates_idle_entry(self, tmp_path): """In-place truncate-and-overwrite (``open(p, 'wb')``) preserves the inode but changes size and mtime, so the cache must still From 12892f7e65f7e293dbe436f041cb4dfc3cc5ff91 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 7 May 2026 06:08:30 -0700 Subject: [PATCH 3/3] Add platform-agnostic test for mmap cache fingerprint invalidation The two scenario tests (replace, truncate) are skipped on Windows because the OS blocks the precondition. That left the actual fingerprint check with no Windows coverage, so a future refactor that drops the check could regress silently on Windows CI. Add a unit test that mutates the stored fingerprint directly to simulate a changed file. Runs on every platform and pins down the exact line of code that prevents stale reads. --- xrspatial/geotiff/tests/test_polish_1488.py | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/xrspatial/geotiff/tests/test_polish_1488.py b/xrspatial/geotiff/tests/test_polish_1488.py index dca09afe..0ed304fe 100644 --- a/xrspatial/geotiff/tests/test_polish_1488.py +++ b/xrspatial/geotiff/tests/test_polish_1488.py @@ -194,6 +194,37 @@ def test_env_var_override(self, tmp_path, monkeypatch): cache = _MmapCache() assert cache._max_size == 5 + def test_fingerprint_mismatch_invalidates_idle_entry(self, tmp_path): + """Platform-agnostic check that the invalidation logic itself + runs. The end-to-end ``os.replace`` and truncate scenarios are + POSIX-only because Windows blocks the precondition, but the + fingerprint comparison is the actual line of code that prevents + stale reads -- exercise it directly by mutating the cached + fingerprint to simulate a file that changed underneath us.""" + cache = _MmapCache() + p = tmp_path / 'fingerprint_1488.bin' + p.write_bytes(b'A' * 16) + + mm1, _ = cache.acquire(str(p)) + cache.release(str(p)) + + real = os.path.realpath(str(p)) + # Force the cached fingerprint to a value the file cannot match, + # which mimics any real-world change (replace, truncate, rename + # back) without depending on the OS allowing those operations + # while we hold an mmap. + cache._entries[real][4] = (-1, -1, -1) + + # Next acquire must notice the mismatch and re-mmap. The mm + # object identity should change; the new entry's fingerprint + # should match the live file. + mm2, _ = cache.acquire(str(p)) + try: + assert mm2 is not mm1 + assert cache._entries[real][4] == cache._file_ident(real) + finally: + cache.release(str(p)) + @pytest.mark.skipif( sys.platform.startswith('win'), reason="Windows holds a file lock while a path is mmapped, "