diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 54e2d306..971e3371 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -13,7 +13,7 @@ fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, geotiff,2026-05-06,1500,HIGH,3,"Pass 3 (2026-05-06): HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. (Pass 3 also opened #1499 for predictor=2 sample-wise decode but it was a duplicate of already-merged #1498 -- closed.) | Pass 2 (2026-05-05) PR #1498: predictor=2 was byte-wise at sample stride, multi-byte integer files written by libtiff/GDAL silently decoded to wrong values; now sample-wise per TIFF spec on CPU and GPU. | Pass 1 (2026-04-23) PR #1247: HIGH fixed CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: BigTIFF writers emit LONG8 strip/tile offsets (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." -geotiff,2026-05-06,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads ("incomplete or truncated stream"), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings. +geotiff,2026-05-06,1498,HIGH,1;5,"HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads (""incomplete or truncated stream""), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings." glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues. @@ -35,3 +35,4 @@ terrain_metrics,2026-04-30,,LOW,2;5,"LOW: Inf input not rejected, propagates as visibility,2026-04-13T12:00:00Z,,,,"Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy." worley,2026-05-01,,MEDIUM,2;5,"MEDIUM: numpy backend uses np.empty_like(data) so integer input dtype produces integer output (distances truncated to 0); cupy/dask paths always produce float32. LOW: freq=inf produces 100000 sentinel (sqrt of initial min_dist=1e10), no validation of freq/seed for non-finite values." zonal,2026-03-30T12:00:00Z,1090,,, +geotiff,2026-05-06,pending-pr-cache-stale,MEDIUM,5,"Pass 5 (2026-05-06): MEDIUM fixed in fix-mmap-cache-stale-inode -- _MmapCache returned stale bytes after the file at the same path was replaced (e.g. by to_geotiff which writes via tempfile + os.replace, swapping the inode). Reproduces on numpy and GPU read paths: write A, read A, write B at same path, read returns A. Fixed by storing (st_ino, st_size, st_mtime_ns) per cache entry and dropping the entry on acquire when the file ident has changed; in-use mmaps stay valid for current holders. Tests added for both atomic-rename replacement and in-place truncate-and-rewrite. | Pass 4 (2026-05-06) PR #1501: sparse COG tiles (TileByteCounts==0). | Pass 3 (2026-05-06) PR #1500: predictor=3 byte-order. | Pass 2 (2026-05-05) PR #1498: predictor=2 sample-wise. | Pass 1 (2026-04-23) PR #1247. Re-confirmed clean: south-up flipping is a documented limitation in test_georef_edges.test_y_coords_known_limitation; CRS round-trip 4326/3857/32633/5070 exact; LERC int default lossless; NaN bit patterns survive predictor=3 round-trip; RATIONAL den=0 returns 0.0; ASCII tag NUL termination spec-correct on write; adler32 in GPU deflate path is per-tile not per-chunk so chunk-combine math does not apply; GeoDoubleParams uses correct double-array offset semantics. Deferred: per-band different NODATA via GDAL_METADATA exposed in attrs but not applied (single dataset-level value used for masking)." diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d3805ad..a5b6115c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Unreleased #### Bug fixes and improvements +- Refresh the geotiff mmap cache when a file is replaced under the same path so re-reads after an atomic-rename overwrite no longer return stale bytes - Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index a7ce2359..2aa8548b 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -82,17 +82,44 @@ class _MmapCache: def __init__(self, max_size: int | None = None): self._lock = threading.Lock() - # path -> [fh, mm, size, refcount] (list so we can mutate in place) - # OrderedDict gives LRU semantics via move_to_end on access. + # path -> [fh, mm, size, refcount, ident] + # ``ident`` is (st_ino, st_size, st_mtime_ns) used to spot files that + # were replaced (e.g. via ``os.replace`` on an atomic write) at the + # same path. ``list`` so we can mutate in place; ``OrderedDict`` gives + # LRU semantics via move_to_end. self._entries: OrderedDict[str, list] = OrderedDict() self._max_size = (max_size if max_size is not None else _mmap_cache_size_from_env()) + @staticmethod + def _file_ident(path: str): + """Return a (st_ino, st_size, st_mtime_ns) tuple for *path* or None.""" + try: + st = _os_module.stat(path) + except OSError: + return None + return (st.st_ino, st.st_size, st.st_mtime_ns) + def acquire(self, path: str): """Get or create a read-only mmap for *path*. Returns (mm, size).""" real = _os_module.path.realpath(path) with self._lock: entry = self._entries.get(real) + ident = self._file_ident(real) + if entry is not None: + # If the file at this path has been replaced (different inode, + # size, or mtime) the cached mmap is stale. Drop the entry so + # we re-open below. If the old entry is still in use by other + # callers, leave their mmap valid -- they still hold a + # reference -- but unlink the cache slot so it isn't reused. + if ident is not None and entry[4] != ident: + self._entries.pop(real) + if entry[3] <= 0: + if entry[1] is not None: + entry[1].close() + entry[0].close() + entry = None + if entry is not None: entry[3] += 1 self._entries.move_to_end(real) @@ -106,7 +133,9 @@ def acquire(self, path: str): mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) else: mm = None - self._entries[real] = [fh, mm, size, 1] + # Re-stat after opening so size matches the mmap we built. + ident = self._file_ident(real) or (0, size, 0) + self._entries[real] = [fh, mm, size, 1, ident] self._evict_locked() return mm, size @@ -142,7 +171,7 @@ def _evict_locked(self): to_drop.append(key) for key in to_drop: entry = self._entries.pop(key) - _, mm, _, _ = entry + mm = entry[1] if mm is not None: mm.close() entry[0].close() @@ -152,7 +181,7 @@ def clear(self): with self._lock: for key in [k for k, v in self._entries.items() if v[3] <= 0]: entry = self._entries.pop(key) - _, mm, _, _ = entry + mm = entry[1] if mm is not None: mm.close() entry[0].close() diff --git a/xrspatial/geotiff/tests/test_polish_1488.py b/xrspatial/geotiff/tests/test_polish_1488.py index e9c7ad76..0ed304fe 100644 --- a/xrspatial/geotiff/tests/test_polish_1488.py +++ b/xrspatial/geotiff/tests/test_polish_1488.py @@ -17,6 +17,7 @@ from __future__ import annotations import os +import sys import warnings import numpy as np @@ -193,6 +194,97 @@ def test_env_var_override(self, tmp_path, monkeypatch): cache = _MmapCache() assert cache._max_size == 5 + def test_fingerprint_mismatch_invalidates_idle_entry(self, tmp_path): + """Platform-agnostic check that the invalidation logic itself + runs. The end-to-end ``os.replace`` and truncate scenarios are + POSIX-only because Windows blocks the precondition, but the + fingerprint comparison is the actual line of code that prevents + stale reads -- exercise it directly by mutating the cached + fingerprint to simulate a file that changed underneath us.""" + cache = _MmapCache() + p = tmp_path / 'fingerprint_1488.bin' + p.write_bytes(b'A' * 16) + + mm1, _ = cache.acquire(str(p)) + cache.release(str(p)) + + real = os.path.realpath(str(p)) + # Force the cached fingerprint to a value the file cannot match, + # which mimics any real-world change (replace, truncate, rename + # back) without depending on the OS allowing those operations + # while we hold an mmap. + cache._entries[real][4] = (-1, -1, -1) + + # Next acquire must notice the mismatch and re-mmap. The mm + # object identity should change; the new entry's fingerprint + # should match the live file. + mm2, _ = cache.acquire(str(p)) + try: + assert mm2 is not mm1 + assert cache._entries[real][4] == cache._file_ident(real) + finally: + cache.release(str(p)) + + @pytest.mark.skipif( + sys.platform.startswith('win'), + reason="Windows holds a file lock while a path is mmapped, " + "so os.replace fails before the cache lookup runs. " + "The stale-cache bug this test guards against can only " + "occur on POSIX.", + ) + def test_replaced_file_invalidates_idle_entry(self, tmp_path): + """``os.replace`` swaps the inode under a stable path. The cache + must spot the new inode and re-mmap rather than serve stale bytes. + Reproduces a real ``to_geotiff`` round-trip pattern -- the writer + does an atomic rename, so a re-read after overwriting the same + path used to return data from the unlinked old file.""" + cache = _MmapCache() + p = tmp_path / 'replaced_1488.bin' + p.write_bytes(b'A' * 16) + + mm1, _ = cache.acquire(str(p)) + first = bytes(mm1[:16]) + assert first == b'A' * 16 + cache.release(str(p)) + + # Atomic-rename style replacement: same path, new inode. + new = tmp_path / 'replaced_1488.bin.tmp' + new.write_bytes(b'B' * 16) + os.replace(str(new), str(p)) + + mm2, _ = cache.acquire(str(p)) + assert bytes(mm2[:16]) == b'B' * 16 + cache.release(str(p)) + + @pytest.mark.skipif( + sys.platform.startswith('win'), + reason="Windows blocks open(path, 'wb') while the path is " + "mmapped, so the in-place rewrite fails before the " + "cache invalidation check. POSIX-only scenario.", + ) + def test_truncated_file_invalidates_idle_entry(self, tmp_path): + """In-place truncate-and-overwrite (``open(p, 'wb')``) preserves the + inode but changes size and mtime, so the cache must still + invalidate.""" + cache = _MmapCache() + p = tmp_path / 'truncate_1488.bin' + p.write_bytes(b'A' * 32) + + mm1, sz1 = cache.acquire(str(p)) + assert sz1 == 32 + cache.release(str(p)) + + # In-place rewrite with a smaller payload. + import time + time.sleep(0.01) # ensure st_mtime_ns advances + with open(str(p), 'wb') as fh: + fh.write(b'C' * 8) + + mm2, sz2 = cache.acquire(str(p)) + assert sz2 == 8 + assert bytes(mm2[:8]) == b'C' * 8 + cache.release(str(p)) + # --------------------------------------------------------------------------- # P-4: decode parallelism threshold